diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index ac0229fad..f33b64246 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -10,9 +10,9 @@ on: - main schedule: - # At minute 0 past every 4nd hour. (see https://crontab.guru) + # At minute 0 past every 12th hour. (see https://crontab.guru) # this job is to keep the ccache cache warm - - cron: '0 */4 * * *' + - cron: '0 */12 * * *' concurrency: # A PR number if a pull request and otherwise the commit hash. This cancels diff --git a/build_tools/ci/cpu_comparison/run_test.py b/build_tools/ci/cpu_comparison/run_test.py index 6dc9dde3b..3ff4a05a5 100755 --- a/build_tools/ci/cpu_comparison/run_test.py +++ b/build_tools/ci/cpu_comparison/run_test.py @@ -643,25 +643,25 @@ def run(self, config): function_name="three_$mm$", ) - # Test(s) of the form matmul(A,B) where A:MxK, B:KxN - test_name = output_dir / "test_from_template.mlir" - template_name = matmul_template_dir / "matmul_MxK_KxN.mlir" - generate_matmul_test(test_name, template_name, 32, 32, 64, "bf16", "f32") - aie_vs_llvm_cpu(config, test_name) - - # Test(s) of the form matmul(A,B) + C where A:MxK, B:KxN, C:N - test_name = output_dir / "test_from_template_bias_N.mlir" - template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir" - generate_matmul_test( - test_name, template_name, 1024, 1024, 512, "bf16", "f32" - ) - if config.vitis_dir: - aie_vs_llvm_cpu( - config, test_name, tile_pipeline="pack-peel", use_ukernel=True - ) + # Test(s) of the form matmul(A,B) where A:MxK, B:KxN + test_name = output_dir / "test_from_template.mlir" + template_name = matmul_template_dir / "matmul_MxK_KxN.mlir" + generate_matmul_test(test_name, template_name, 32, 32, 64, "bf16", "f32") + aie_vs_llvm_cpu(config, test_name) + + # Test(s) of the form matmul(A,B) + C where A:MxK, B:KxN, C:N + test_name = output_dir / "test_from_template_bias_N.mlir" + template_name = matmul_template_dir / 
"matmul_bias_MxK_KxN_N.mlir" + generate_matmul_test( + test_name, template_name, 1024, 1024, 512, "bf16", "f32" + ) + if config.vitis_dir: aie_vs_llvm_cpu( - config, test_name, tile_pipeline="pack-peel", use_ukernel=False + config, test_name, tile_pipeline="pack-peel", use_ukernel=True ) + aie_vs_llvm_cpu( + config, test_name, tile_pipeline="pack-peel", use_ukernel=False + ) class SmokeSet(TestSet): diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp index 52871fb88..0fd4932ba 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp @@ -11,7 +11,6 @@ #include "Passes.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/STLExtras.h" -#include "mlir/Analysis/TopologicalSortUtils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -113,28 +112,11 @@ bool isJoin(ObjectFifoLinkOp op) { return op.getFifoIns().size() > 1; } bool isDistribute(ObjectFifoLinkOp op) { return op.getFifoOuts().size() > 1; } std::optional getOptionalSharedTile(ObjectFifoLinkOp op) { - if (isJoin(op)) { - auto fifoOut = getOutputObjectFifos(op)[0]; - for (auto fifoIn : getInputObjectFifos(op)) - if (fifoOut.getProducerTile() != fifoIn.getConsumerTiles()[0]) return {}; - return {fifoOut.getProducerTile()}; - } - - if (isDistribute(op)) { - auto fifoIn = getInputObjectFifos(op)[0]; - for (auto fifoOut : getOutputObjectFifos(op)) - if (fifoIn.getConsumerTiles()[0] != fifoOut.getProducerTile()) return {}; - return {fifoIn.getConsumerTiles()[0]}; - } - - auto fifoIn = getInputObjectFifos(op); - if (auto fifoOut = getOutputObjectFifos(op); - !fifoIn.empty() && !fifoOut.empty()) - for (auto consumerIn : fifoIn[0].getConsumerTiles()) - if (consumerIn == fifoOut[0].getProducerTile()) - 
return {fifoOut[0].getProducerTile()}; - return {}; + std::vector fifoOuts = getOutputObjectFifos(op); + assert(fifoOuts.size() > 0); + return fifoOuts[0].getProducerTile(); } + } // namespace class LockAnalysis { @@ -168,21 +150,7 @@ class DMAChannelAnalysis { DenseMap consumerChannelsPerTile; public: - DMAChannelAnalysis(DeviceOp &device) { - // go over the channels used for each tile and update the producer/consumer - // channel maps - for (auto memOp : device.getOps()) { - Region &r = memOp.getBody(); - auto tile = memOp.getTile(); - for (auto &bl : r.getBlocks()) { - for (auto op : bl.getOps()) { - static_cast(op.getChannelDir()) == DMAChannelDir::MM2S - ? getProducerDMAChannel(tile) - : getConsumerDMAChannel(tile); - } - } - } - } + DMAChannelAnalysis() {} /// Given an AIE tile, returns its next usable producer channel. SwitchDMAConnection getProducerDMAChannel(Value tile) { @@ -536,12 +504,6 @@ void replaceReleaseOp( DenseMap, std::vector> &releaseOps) { ObjectFifoCreateOp op = getObjectFifo(releaseOp); - auto core = releaseOp->getParentOfType(); - if (auto linkOp = getOptionalLinkOp(op)) - if (core.getTile() == *getOptionalSharedTile(*linkOp)) - llvm::report_fatal_error( - "currently cannot access objectFifo used in " - "ObjectFifoLinkOp"); auto port = releaseOp.getPort(); std::pair opPort = {op, static_cast(port)}; @@ -653,12 +615,7 @@ void replaceObjectAcquireOp( const DenseMap> &buffersPerFifo, DenseMap> &subviews) { ObjectFifoCreateOp op = getObjectFifo(acquireOp); - auto core = acquireOp->getParentOfType(); auto linkOp = getOptionalLinkOp(op); - if (linkOp && core.getTile() == *getOptionalSharedTile(*linkOp)) - llvm::report_fatal_error( - "currently cannot access objectFifo used in " - "ObjectFifoLinkOp"); // index of next element to acquire for this objectFifo // useful for keeping track of which @@ -995,7 +952,7 @@ struct AMDAIEObjectFifoStatefulTransformPass : mlir::OperationPass { void runOnOperation() override { DeviceOp device = getOperation(); 
LockAnalysis lockAnalysis(device); - DMAChannelAnalysis dmaAnalysis(device); + DMAChannelAnalysis dmaAnalysis; OpBuilder builder = OpBuilder::atBlockEnd(device.getBody()); // maps each objFifo to its corresponding buffer DenseMap> buffersPerFifo; @@ -1092,16 +1049,14 @@ struct AMDAIEObjectFifoStatefulTransformPass : mlir::OperationPass { } // Remove old ops - SetVector opsToErase; + IRRewriter rewriter(&getContext()); device.walk([&](Operation *op) { if (isa(op)) - opsToErase.insert(op); + ObjectFifoSubviewAccessOp, ObjectFifoReleaseOp>(op)) { + op->dropAllUses(); + rewriter.eraseOp(op); + } }); - topologicalSort(opsToErase); - IRRewriter rewriter(&getContext()); - for (auto it = opsToErase.rbegin(); it != opsToErase.rend(); ++it) - (*it)->erase(); } }; diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir index 28ba3ef42..597a8c409 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir @@ -1,7 +1,7 @@ // RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s -// CHECK-LABEL: aie.device(npu1_4col) { +// CHECK-LABEL: aie.device(xcvc1902) { // CHECK: memref.global "public" @of2_cons : memref<16xi32> // CHECK: memref.global "public" @of2 : memref<16xi32> // CHECK: memref.global "public" @of1_cons : memref<16xi32> @@ -68,7 +68,7 @@ // CHECK: } module @link_AIE1 { - aie.device(npu1_4col) { + aie.device(xcvc1902) { %tile20 = aie.tile(2, 0) %tile12 = aie.tile(1, 2) %tile22 = aie.tile(2, 2) diff --git a/compiler/plugins/target/AMD-AIE/aie/test/tileDMA_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/tileDMA_test.mlir deleted file mode 100644 index 3818d0eea..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/tileDMA_test.mlir +++ /dev/null @@ -1,145 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// 
CHECK: memref.global "public" @objfifo_cons : memref<16xi32> -// CHECK: memref.global "public" @objfifo : memref<16xi32> -// CHECK: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK: %[[OBJFIFO_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0"} : memref<16xi32> -// CHECK: %[[OBJFIFO_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_1"} : memref<16xi32> -// CHECK: %[[OBJFIFO_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]], 0) {init = 2 : i8, sym_name = "objfifo_cons_prod_lock"} -// CHECK: %[[OBJFIFO_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]], 1) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock"} -// CHECK: %[[OBJFIFO_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_buff_0"} : memref<16xi32> -// CHECK: %[[OBJFIFO_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_buff_1"} : memref<16xi32> -// CHECK: %[[OBJFIFO_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]], 3) {init = 2 : i8, sym_name = "objfifo_prod_lock"} -// CHECK: %[[OBJFIFO_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]], 4) {init = 0 : i8, sym_name = "objfifo_cons_lock"} -// CHECK: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) : memref<16xi32> -// CHECK: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]], 0) -// CHECK: %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) : memref<16xi32> -// CHECK: %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]], 1) -// CHECK: %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) : memref<16xi32> -// CHECK: %[[LOCK_1_2_3:.*]] = aie.lock(%[[TILE_1_2]], 2) -// CHECK: aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_3_3]], DMA : 0) -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// 
CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_BUFF_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_BUFF_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2]], Acquire, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 0) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], Acquire, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], Release, 0) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb4, ^bb5) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], Acquire, 0) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb5: -// CHECK: %[[VAL_2:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = 
aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @tileDMA_channels { - aie.device(npu1_4col) { - %tile12 = aie.tile(1, 2) - %tile33 = aie.tile(3, 3) - %buff0 = aie.buffer(%tile12) : memref<16xi32> - %lock0 = aie.lock(%tile12, 0) - %buff1 = aie.buffer(%tile12) : memref<16xi32> - %lock1 = aie.lock(%tile12, 1) - %buff2 = aie.buffer(%tile12) : memref<16xi32> - %lock2 = aie.lock(%tile12, 2) - aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo> - func.func @some_work(%lineOut : memref<16xi32>) -> () { - return - } - %core12 = aie.core(%tile12) { - %c0 = arith.constant 0 : index - %c2 = arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - } - aie.end - } - %mem12 = aie.mem(%tile12) { - %dma1 = 
aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: - aie.use_lock(%lock0, Acquire, 1) - aie.dma_bd(%buff0 : memref<16xi32>) {len = 16 : i32} - aie.use_lock(%lock0, Release, 0) - aie.next_bd ^bb2 - ^bb2: - aie.use_lock(%lock1, Acquire, 1) - aie.dma_bd(%buff1 : memref<16xi32>) {len = 16 : i32} - aie.use_lock(%lock1, Release, 0) - aie.next_bd ^bb1 - ^bb3: - %dma2 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: - aie.use_lock(%lock2, Acquire, 0) - aie.dma_bd(%buff2 : memref<16xi32>) {len = 16 : i32} - aie.use_lock(%lock2, Release, 1) - aie.next_bd ^bb4 - ^bb5: - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp new file mode 100644 index 000000000..285fa77c3 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp @@ -0,0 +1,10 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h" + +/// Include the definitions of the logical-objFifo-like interfaces. +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp.inc" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h new file mode 100644 index 000000000..e34f129af --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h @@ -0,0 +1,16 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_AMDAIE_LOGICALOBJFIFOOPINTERFACE_H_ +#define IREE_COMPILER_AMDAIE_LOGICALOBJFIFOOPINTERFACE_H_ + +#include "mlir/IR/OpImplementation.h" + +// clang-format off +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h.inc" +// clang-format on + +#endif // IREE_COMPILER_AMDAIE_LOGICALOBJFIFOOPINTERFACE_H_ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td new file mode 100644 index 000000000..4fbfc0a91 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td @@ -0,0 +1,37 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMDAIE_DIALECT_LOGICALOBJFIFOOPINTERFACE +#define IREE_AMDAIE_DIALECT_LOGICALOBJFIFOOPINTERFACE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/CopyOpInterface.td" + +//===----------------------------------------------------------------------===// +// Defines the interface for logical objectFifo operations. +//===----------------------------------------------------------------------===// + +def LogicalObjFifoOpInterface : OpInterface<"LogicalObjFifoOpInterface"> { + let description = [{ + Interface for operations creating a logical objectFifo. 
+ }]; + let cppNamespace = "mlir::iree_compiler::AMDAIE"; + + let methods = [ + InterfaceMethod< + /*desc=*/"Return the assigned tiles.", + /*retTy=*/"::mlir::OperandRange", + /*methodName=*/"getTiles", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return $_op.getTiles(); + }] + > + ]; +} + +#endif // IREE_AMDAIE_DIALECT_LOGICALOBJFIFOOPINTERFACE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index a6a65ac19..0c501fb3e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -522,16 +522,14 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result, // AMDAIE_NpuDmaCpyNdOp //===----------------------------------------------------------------------===// -// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target -// and source BD IDs. -void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, - ArrayRef targetOffsets, - ArrayRef targetSizes, - ArrayRef targetStrides, - ArrayRef sourceOffsets, - ArrayRef sourceSizes, - ArrayRef sourceStrides, - mlir::Value targetBdId, mlir::Value sourceBdId) { +// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and +// source BD IDs. 
+void NpuDmaCpyNdOp::build( + OpBuilder &b, OperationState &result, Value dma, Value target, + ArrayRef targetOffsets, ArrayRef targetSizes, + ArrayRef targetStrides, Value targetBdId, Value source, + ArrayRef sourceOffsets, ArrayRef sourceSizes, + ArrayRef sourceStrides, Value sourceBdId) { SmallVector staticTargetOffsets, staticTargetSizes, staticTargetStrides; SmallVector staticSourceOffsets, staticSourceSizes, @@ -552,22 +550,21 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, staticSourceSizes); dispatchIndexOpFoldResults(sourceStrides, dynamicSourceStrides, staticSourceStrides); - build(b, result, b.getIndexType(), dma, dynamicTargetOffsets, + build(b, result, b.getIndexType(), dma, target, dynamicTargetOffsets, dynamicTargetSizes, dynamicTargetStrides, staticTargetOffsets, - staticTargetSizes, staticTargetStrides, dynamicSourceOffsets, - dynamicSourceSizes, dynamicSourceStrides, staticSourceOffsets, - staticSourceSizes, staticSourceStrides, targetBdId, sourceBdId); + staticTargetSizes, staticTargetStrides, targetBdId, source, + dynamicSourceOffsets, dynamicSourceSizes, dynamicSourceStrides, + staticSourceOffsets, staticSourceSizes, staticSourceStrides, + sourceBdId); } // Build a NpuDmaCpyNdOp with static entries. 
-void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, - ArrayRef targetOffsets, - ArrayRef targetSizes, - ArrayRef targetStrides, - ArrayRef sourceOffsets, - ArrayRef sourceSizes, - ArrayRef sourceStrides, - mlir::Value targetBdId, mlir::Value sourceBdId) { +void NpuDmaCpyNdOp::build( + OpBuilder &b, OperationState &result, Value dma, Value target, + ArrayRef targetOffsets, ArrayRef targetSizes, + ArrayRef targetStrides, mlir::Value targetBdId, Value source, + ArrayRef sourceOffsets, ArrayRef sourceSizes, + ArrayRef sourceStrides, mlir::Value sourceBdId) { SmallVector targetOffsetValues = llvm::to_vector<4>( llvm::map_range(targetOffsets, [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); @@ -592,17 +589,18 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, llvm::map_range(sourceStrides, [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); })); - build(b, result, dma, targetOffsetValues, targetSizeValues, - targetStrideValues, sourceOffsetValues, sourceSizeValues, - sourceStrideValues, targetBdId, sourceBdId); + build(b, result, dma, target, targetOffsetValues, targetSizeValues, + targetStrideValues, targetBdId, source, sourceOffsetValues, + sourceSizeValues, sourceStrideValues, sourceBdId); } // Build a NpuDmaCpyNdOp with dynamic entries. 
void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, - ValueRange targetOffsets, ValueRange targetSizes, - ValueRange targetStrides, ValueRange sourceOffsets, - ValueRange sourceSizes, ValueRange sourceStrides, - mlir::Value targetBdId, mlir::Value sourceBdId) { + Value target, ValueRange targetOffsets, + ValueRange targetSizes, ValueRange targetStrides, + mlir::Value targetBdId, Value source, + ValueRange sourceOffsets, ValueRange sourceSizes, + ValueRange sourceStrides, mlir::Value sourceBdId) { SmallVector targetOffsetValues = llvm::to_vector<4>(llvm::map_range( targetOffsets, [](Value v) -> OpFoldResult { return v; })); @@ -619,9 +617,212 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, SmallVector sourceStrideValues = llvm::to_vector<4>(llvm::map_range( sourceStrides, [](Value v) -> OpFoldResult { return v; })); - build(b, result, dma, targetOffsetValues, targetSizeValues, - targetStrideValues, sourceOffsetValues, sourceSizeValues, - sourceStrideValues, targetBdId, sourceBdId); + build(b, result, dma, target, targetOffsetValues, targetSizeValues, + targetStrideValues, targetBdId, source, sourceOffsetValues, + sourceSizeValues, sourceStrideValues, sourceBdId); +} + +void NpuDmaCpyNdOp::print(OpAsmPrinter &p) { + Operation *op = getOperation(); + p << " " << getDma() << "("; + if (getTarget()) p << getTarget(); + printDynamicIndexList(p, op, getTargetOffsets(), getTargetStaticOffsets()); + p << " "; + printDynamicIndexList(p, op, getTargetSizes(), getTargetStaticSizes()); + p << " "; + printDynamicIndexList(p, op, getTargetStrides(), getTargetStaticStrides()); + if (getTargetBdId()) p << " bd_id = " << getTargetBdId(); + p << ", "; + if (getSource()) p << getSource(); + printDynamicIndexList(p, op, getSourceOffsets(), getSourceStaticOffsets()); + p << " "; + printDynamicIndexList(p, op, getSourceSizes(), getSourceStaticSizes()); + p << " "; + printDynamicIndexList(p, op, getSourceStrides(), 
getSourceStaticStrides()); + if (getSourceBdId()) p << " bd_id = " << getSourceBdId(); + p << ")"; + SmallVector elidedAttrs; + elidedAttrs.push_back("operandSegmentSizes"); + elidedAttrs.push_back("target_static_offsets"); + elidedAttrs.push_back("target_static_sizes"); + elidedAttrs.push_back("target_static_strides"); + elidedAttrs.push_back("source_static_offsets"); + elidedAttrs.push_back("source_static_sizes"); + elidedAttrs.push_back("source_static_strides"); + p.printOptionalAttrDictWithKeyword(op->getAttrs(), elidedAttrs); + if (getTarget() || getSource()) p << " :"; + if (getTarget()) p << " target_type = " << getTarget().getType(); + if (getSource()) p << " source_type = " << getSource().getType(); +} + +ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) { + OpBuilder b(parser.getContext()); + auto indexType = b.getIndexType(); + + SMLoc targetOperandsLoc, sourceOperandsLoc; + OpAsmParser::UnresolvedOperand dma; + SmallVector targetOperands, sourceOperands, + targetBdIdOperands, sourceBdIdOperands; + DenseI64ArrayAttr targetStaticOffsets, targetStaticSizes, targetStaticStrides; + SmallVector targetDynamicOffsets, + targetDynamicSizes, targetDynamicStrides; + DenseI64ArrayAttr sourceStaticOffsets, sourceStaticSizes, sourceStaticStrides; + SmallVector sourceDynamicOffsets, + sourceDynamicSizes, sourceDynamicStrides; + SmallVector targetTypes; + SmallVector sourceTypes; + + if (failed(parser.parseOperand(dma)) || failed(parser.parseLParen())) + return failure(); + + OpAsmParser::UnresolvedOperand target; + if (parser.parseOptionalOperand(target).has_value()) { + targetOperands.push_back(target); + } + if (failed(parseDynamicIndexList(parser, targetDynamicOffsets, + targetStaticOffsets))) { + return failure(); + } + result.getOrAddProperties().target_static_offsets = + targetStaticOffsets; + if (failed(parseDynamicIndexList(parser, targetDynamicSizes, + targetStaticSizes))) { + return failure(); + } + 
result.getOrAddProperties().target_static_sizes = + targetStaticSizes; + if (failed(parseDynamicIndexList(parser, targetDynamicStrides, + targetStaticStrides))) { + return failure(); + } + result.getOrAddProperties().target_static_strides = + targetStaticStrides; + + if (succeeded(parser.parseOptionalKeyword("bd_id"))) { + if (failed(parser.parseEqual())) return failure(); + OpAsmParser::UnresolvedOperand bdId; + if (failed(parser.parseOperand(bdId))) return failure(); + targetBdIdOperands.push_back(bdId); + } + + if (failed(parser.parseComma())) return failure(); + + OpAsmParser::UnresolvedOperand source; + if (parser.parseOptionalOperand(source).has_value()) { + sourceOperands.push_back(source); + } + if (failed(parseDynamicIndexList(parser, sourceDynamicOffsets, + sourceStaticOffsets))) { + return failure(); + } + result.getOrAddProperties().source_static_offsets = + sourceStaticOffsets; + if (failed(parseDynamicIndexList(parser, sourceDynamicSizes, + sourceStaticSizes))) { + return failure(); + } + result.getOrAddProperties().source_static_sizes = + sourceStaticSizes; + if (failed(parseDynamicIndexList(parser, sourceDynamicStrides, + sourceStaticStrides))) { + return failure(); + } + result.getOrAddProperties().source_static_strides = + sourceStaticStrides; + + if (succeeded(parser.parseOptionalKeyword("bd_id"))) { + if (failed(parser.parseEqual())) return failure(); + OpAsmParser::UnresolvedOperand bdId; + if (failed(parser.parseOperand(bdId))) return failure(); + sourceBdIdOperands.push_back(bdId); + } + + if (failed(parser.parseRParen())) return failure(); + { + auto loc = parser.getCurrentLocation(); + if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) { + return failure(); + } + } + + if (succeeded(parser.parseOptionalColon())) { + if 
(succeeded(parser.parseOptionalKeyword("target_type"))) { + if (parser.parseEqual()) return failure(); + Type targetType; + if (failed(parser.parseType(targetType))) return failure(); + targetTypes.push_back(targetType); + } + if (succeeded(parser.parseOptionalKeyword("source_type"))) { + if (parser.parseEqual()) return failure(); + Type sourceType; + if (failed(parser.parseType(sourceType))) return failure(); + sourceTypes.push_back(sourceType); + } + } + + llvm::copy( + ArrayRef({1, static_cast(targetOperands.size()), + static_cast(targetDynamicOffsets.size()), + static_cast(targetDynamicSizes.size()), + static_cast(targetDynamicStrides.size()), + static_cast(targetBdIdOperands.size()), + static_cast(sourceOperands.size()), + static_cast(sourceDynamicOffsets.size()), + static_cast(sourceDynamicSizes.size()), + static_cast(sourceDynamicStrides.size()), + static_cast(sourceBdIdOperands.size())}), + result.getOrAddProperties() + .operandSegmentSizes.begin()); + + if (failed(parser.resolveOperand(dma, indexType, result.operands))) + return failure(); + if (failed(parser.resolveOperands(targetOperands, targetTypes, + targetOperandsLoc, result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetDynamicOffsets, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetDynamicSizes, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetDynamicStrides, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetBdIdOperands, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceOperands, sourceTypes, + sourceOperandsLoc, result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceDynamicOffsets, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceDynamicSizes, indexType, + result.operands))) { + 
return failure(); + } + if (failed(parser.resolveOperands(sourceDynamicStrides, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceBdIdOperands, indexType, + result.operands))) { + return failure(); + } + + result.addTypes(indexType); + return success(); } DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp( @@ -634,14 +835,15 @@ DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp( ::llvm::SmallVector &newSourceStrides) { Location loc = (*this)->getLoc(); auto newOp = rewriter.create( - loc, getDma(), + loc, getDma(), getTarget(), getValueOrCreateConstantIndexOp(rewriter, loc, newTargetOffsets), getValueOrCreateConstantIndexOp(rewriter, loc, newTargetSizes), getValueOrCreateConstantIndexOp(rewriter, loc, newTargetStrides), + getTargetBdId(), getSource(), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceOffsets), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceSizes), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceStrides), - getTargetBdId(), getSourceBdId()); + getSourceBdId()); return cast(newOp.getOperation()); } @@ -660,8 +862,9 @@ struct NpuDmaCpyNdOpReplacementBuilder { ArrayRef srcMixedSizes, ArrayRef srcMixedStrides) { rewriter.replaceOpWithNewOp( - dmaOp, dmaOp.getDma(), tgtMixedOffsets, tgtMixedSizes, tgtMixedStrides, - srcMixedOffsets, srcMixedSizes, srcMixedStrides, dmaOp.getTargetBdId(), + dmaOp, dmaOp.getDma(), dmaOp.getTarget(), tgtMixedOffsets, + tgtMixedSizes, tgtMixedStrides, dmaOp.getTargetBdId(), + dmaOp.getSource(), srcMixedOffsets, srcMixedSizes, srcMixedStrides, dmaOp.getSourceBdId()); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h index 5a587ef0e..b3cafdbf7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h @@ -7,6 +7,10 @@ #ifndef IREE_COMPILER_AMDAIE_OPS_H_ #define 
IREE_COMPILER_AMDAIE_OPS_H_ +#include "iree-amd-aie/IR/AMDAIEAttrs.h" +#include "iree-amd-aie/IR/AMDAIEDmaOpInterface.h" +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h" +#include "iree-amd-aie/IR/AMDAIETypes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" @@ -14,10 +18,6 @@ #include "mlir/Interfaces/CopyOpInterface.h" #include "mlir/Interfaces/ViewLikeInterface.h" -#include "iree-amd-aie/IR/AMDAIEAttrs.h" -#include "iree-amd-aie/IR/AMDAIEDmaOpInterface.h" -#include "iree-amd-aie/IR/AMDAIETypes.h" - // clang-format off #include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 763d3a165..0f46400da 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -18,6 +18,7 @@ include "iree-amd-aie/IR/AMDAIEAttrs.td" include "iree-amd-aie/aie_runtime/AMDAIEEnums.td" include "iree-amd-aie/IR/AMDAIEDialect.td" include "iree-amd-aie/IR/AMDAIEDmaOpInterface.td" +include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td" include "iree-amd-aie/IR/AMDAIETypes.td" //===----------------------------------------------------------------------===// @@ -208,8 +209,8 @@ def AMDAIE_BdIdOp: AMDAIE_Op<"bd_id", [ // IREE AMDAIE Npu Ops //===----------------------------------------------------------------------===// -def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", - [AttrSizedOperandSegments, DoublyStridedOpInterface]>, +def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ + AttrSizedOperandSegments, DoublyStridedOpInterface]>, Results<(outs Index)> { let summary = "The Npu uController's dma operator"; let description = [{ @@ -239,58 +240,50 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", let arguments = ( ins Index:$dma, + Optional:$target, 
Variadic:$target_offsets, Variadic:$target_sizes, Variadic:$target_strides, DenseI64ArrayAttr:$target_static_offsets, DenseI64ArrayAttr:$target_static_sizes, DenseI64ArrayAttr:$target_static_strides, + Optional:$target_bd_id, + Optional:$source, Variadic:$source_offsets, Variadic:$source_sizes, Variadic:$source_strides, DenseI64ArrayAttr:$source_static_offsets, DenseI64ArrayAttr:$source_static_sizes, DenseI64ArrayAttr:$source_static_strides, - Optional:$target_bd_id, Optional:$source_bd_id ); - let assemblyFormat = [{ - $dma - `(` - custom($target_offsets, $target_static_offsets) - custom($target_sizes, $target_static_sizes) - custom($target_strides, $target_static_strides) - (`bd_id` `=` $target_bd_id^)? - `,` - custom($source_offsets, $source_static_offsets) - custom($source_sizes, $source_static_sizes) - custom($source_strides, $source_static_strides) - (`bd_id` `=` $source_bd_id^)? - `)` - attr-dict - }]; + // Use a custom assembly format because of weird spaces being inserted around + // the optional `target` by the default assembly format generator. + let hasCustomAssemblyFormat = 1; let builders = [ // Build a NpuDmaCpyNdOp with mixed static and dynamic entries. - OpBuilder<(ins "Value":$dma, "ArrayRef":$target_offsets, + OpBuilder<(ins "Value":$dma, "::mlir::Value":$target, + "ArrayRef":$target_offsets, "ArrayRef":$target_sizes, - "ArrayRef":$target_strides, - "ArrayRef":$source_offsets, + "ArrayRef":$target_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source, "ArrayRef":$source_offsets, "ArrayRef":$source_sizes, - "ArrayRef":$source_strides, "::mlir::Value":$target_bd_id, - "::mlir::Value":$source_bd_id)>, + "ArrayRef":$source_strides, "::mlir::Value":$source_bd_id)>, // Build a NpuDmaCpyNdOp with static entries. 
- OpBuilder<(ins "Value":$dma, "ArrayRef":$target_offsets, - "ArrayRef":$target_sizes, "ArrayRef":$target_strides, - "ArrayRef":$source_offsets, "ArrayRef":$source_sizes, - "ArrayRef":$source_strides, "::mlir::Value":$target_bd_id, + OpBuilder<(ins "Value":$dma, "::mlir::Value":$target, + "ArrayRef":$target_offsets, "ArrayRef":$target_sizes, + "ArrayRef":$target_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source, "ArrayRef":$source_offsets, + "ArrayRef":$source_sizes, "ArrayRef":$source_strides, "::mlir::Value":$source_bd_id)>, // Build a NpuDmaCpyNdOp with dynamic entries. - OpBuilder<(ins "Value":$dma, "ValueRange":$target_offsets, - "ValueRange":$target_sizes, "ValueRange":$target_strides, - "ValueRange":$source_offsets, "ValueRange":$source_sizes, - "ValueRange":$source_strides, "::mlir::Value":$target_bd_id, + OpBuilder<(ins "Value":$dma, "::mlir::Value":$target, + "ValueRange":$target_offsets, "ValueRange":$target_sizes, + "ValueRange":$target_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source, "ValueRange":$source_offsets, + "ValueRange":$source_sizes, "ValueRange":$source_strides, "::mlir::Value":$source_bd_id)> ]; @@ -523,7 +516,8 @@ def AMDAIE_LogicalObjectFifoAcquire: } def AMDAIE_LogicalObjectFifoFromMemrefOp - : AMDAIE_Op<"logicalobjectfifo.from_memref", [Pure]> { + : AMDAIE_Op<"logicalobjectfifo.from_memref", + [LogicalObjFifoOpInterface, Pure]> { let summary = "Create a logical objectFifo from a memref"; let description = [{ Creates a logical objectFifo which encapsulates a memref. The logical objectFifo @@ -631,6 +625,46 @@ def AMDAIE_LogicalObjectFifoLink }]; } +def AMDAIE_LogicalObjectFifoPlaceholderOp: + AMDAIE_Op<"logicalobjectfifo.placeholder", [ + LogicalObjFifoOpInterface, Pure]> { + let summary = "A placeholder for a logical objectFifo."; + let description = [{ + Represents a placeholder for a logical objectFifo. The actual logical + objectFifo can then be provided later. 
This is useful for creating static + connections (`amdaie.circular_dma_cpy_nd`) that can be reused for different + logical objectFifos. + + Example: + ```mlir + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) + binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1024xi32> + %alloc = memref.alloc() : memref<1024xi32, 1 : i32> + %obj0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_1} + : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo> + %ph = amdaie.logicalobjectfifo.placeholder{} + : !amdaie.logicalobjectfifo> + %connection = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %ph[] [] []) + : (!amdaie.logicalobjectfifo>, + !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %obj1 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} + : memref<1024xi32> -> !amdaie.logicalobjectfifo> + %npu_dma = amdaie.npu.dma_cpy_nd %connection([] [] [], + %obj1[%c0, %c32] [%c32, %c32] [%c32, %c1]) + : source_type = !amdaie.logicalobjectfifo> + amdaie.end + } + ``` + }]; + + let arguments = (ins Variadic:$tiles); + + let results = (outs AnyAMDAIELogicalObjectFifoType:$output); + + let assemblyFormat = [{ `{` $tiles `}` attr-dict `:` type($output)}]; +} + def AMDAIE_LogicalObjectFifoRelease: AMDAIE_Op<"logicalobjectfifo.release", []> { let summary = "Semaphore operation to release objects from a logical" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt index 11d57f637..9e07f4f03 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt @@ -14,6 +14,7 @@ iree_cc_library( HDRS "AMDAIEAttrs.h" "AMDAIEDialect.h" + "AMDAIELogicalObjFifoOpInterface.h" "AMDAIEOps.h" "AMDAIETypes.h" TEXTUAL_HDRS @@ -29,15 +30,19 @@ iree_cc_library( "AMDAIEOps.h.inc" "AMDAIEDmaOpInterface.cpp.inc" "AMDAIEDmaOpInterface.h.inc" + "AMDAIELogicalObjFifoOpInterface.h.inc" +
"AMDAIELogicalObjFifoOpInterface.cpp.inc" SRCS "AMDAIEAttrs.cpp" "AMDAIEDmaOpInterface.cpp" "AMDAIEDialect.cpp" + "AMDAIELogicalObjFifoOpInterface.cpp" "AMDAIEOps.cpp" "AMDAIETypes.cpp" DEPS ::AMDAIEDialectGen ::AMDAIEDmaOpInterfaceGen + ::AMDAIELogicalObjFifoOpInterface ::AMDAIEOpsGen ::AMDAIETypesGen ::AMDAIEAttrsGen @@ -104,3 +109,13 @@ iree_tablegen_library( --gen-op-interface-decls AMDAIEDmaOpInterface.h.inc --gen-op-interface-defs AMDAIEDmaOpInterface.cpp.inc ) + +iree_tablegen_library( + NAME + AMDAIELogicalObjFifoOpInterface + TD_FILE + "AMDAIELogicalObjFifoOpInterface.td" + OUTS + --gen-op-interface-decls AMDAIELogicalObjFifoOpInterface.h.inc + --gen-op-interface-defs AMDAIELogicalObjFifoOpInterface.cpp.inc +) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index d5d85ede0..e740db7c3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -163,6 +163,22 @@ func.func @logicalobjectfifo_link(%arg0: !amdaie.logicalobjectfifo> +// CHECK: %{{.+}} = amdaie.logicalobjectfifo.placeholder{%[[tile_0_0]]} : !amdaie.logicalobjectfifo> +func.func @logicalobjectfifo_placeholder() { + %c0 = arith.constant 0 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + %placeholder_0 = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> + %placeholder_1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + return +} + + // ----- // CHECK-LABEL: func.func @logicalobjectfifo_release @@ -265,6 +281,45 @@ func.func @npu_dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo>, %[[ARG1:.+]]: !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd %[[DMA0]](%[[ARG0]][] [] [], %[[ARG1]][] [] []) : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> 
+func.func @npu_dma_cpy_nd_target_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.npu.dma_cpy_nd %0(%arg0[] [] [], %arg1[] [] []) : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_dma_cpy_nd_all_operands +// CHECK-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo>, %[[ARG1:.+]]: !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd %[[DMA0]] +// CHECK-SAME: %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [1, 1, %[[C8]], %[[C16]]] [%[[C128]], %[[C128]], %[[C16]], 1] bd_id = %[[BD_ID_0_0]] +// CHECK-SAME: %[[ARG1]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [1, 1, %[[C8]], %[[C16]]] [%[[C128]], %[[C16]], %[[C16]], 1] bd_id = %[[BD_ID_0_0]] +// CHECK-SAME: : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> +func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + %tile = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile, 0) + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.npu.dma_cpy_nd %0(%arg0[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c128, %c16, 1] bd_id = %bd_id, %arg1[%c0, %c0, %c0, %c0] [1, 
1, %c8, %c16] [%c128, %c16, %c16, 1] bd_id = %bd_id) : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> + return +} + +// ----- + // CHECK-LABEL: func.func @workgroup // CHECK: amdaie.workgroup // CHECK: amdaie.core diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp index 5cbebf39e..dccbd101f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp @@ -23,43 +23,15 @@ static void writeLDScriptMap(raw_ostream &output, BufferOp buf, int offset) { output << ". += 0x" << llvm::utohexstr(numBytes) << ";\n"; } -///// ld.script format: -// -// MEMORY -// { -// program (RX) : ORIGIN = 0, LENGTH = 0x0020000 -// data (!RX) : ORIGIN = 0x20000, LENGTH = 0x0020000 -// } -// ENTRY(_main_init) -// INPUT(something.o) -// SECTIONS -// { -// . = 0x0; -// .text : { -// // the __start symbol from crt0.o has to come at address zero. -// *crt0.o(.text) -// . = 0x200; -// *(.text) -// } > program -// .data : { *(.data) } > data -// . = 0x20000; -// _sp_start_value_DM_stack = .; -// . = 0x24000; -// a = .; -// . 
+= 1024; -// .bss : { *(.bss) } > data -// } LogicalResult mlir::iree_compiler::AMDAIE::AIETranslateToLdScript( DeviceOp deviceOp, raw_ostream &output, int tileCol, int tileRow) { DenseMap tiles; DenseMap> buffers; - collectTiles(deviceOp, tiles); ::collectBuffers(deviceOp, buffers); - AMDAIEDeviceModel deviceModel = - getDeviceModel(static_cast(deviceOp.getDevice())); + AMDAIEDeviceModel deviceModel = getDeviceModel(deviceOp.getDevice()); for (auto tile : deviceOp.getOps()) if (tile.getCol() == tileCol && tile.getRow() == tileRow) { TileLoc srcCoord = {tile.getCol(), tile.getRow()}; @@ -95,9 +67,33 @@ SECTIONS *(.data*); *(.rodata*) } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .tctmemtab : { + *(.tctmemtab) + } + .rtstab : { + *(.rtstab) + } + .eoltab : { + *(.eoltab) + } + .chesstypeannotationtab : { + *(.chesstypeannotationtab) + } )THESCRIPT"; auto doBuffer = [&](std::optional tile, int offset, - const std::string& dir) { + const std::string &dir) { if (tile) { if (tiles.count({tile->col, tile->row})) for (auto buf : buffers[tiles[{tile->col, tile->row}]]) @@ -132,7 +128,6 @@ SECTIONS deviceModel.getMemEastBaseAddress(), std::string("east")); output << " .bss : { *(.bss) } > data\n"; - output << " .bss.DMb.4 : { *(.bss.DMb.4) } > data\n"; output << "}\n"; if (auto coreOp = getCoreOp(tile)) { output << "PROVIDE(main = core_" << std::to_string(tile.getCol()) << "_" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index cb84b85c3..6eb1a2671 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -632,6 +632,7 @@ static LogicalResult generateCoreElfFiles( } flags.emplace_back("--target=" + targetLower + "-none-unknown-elf"); flags.emplace_back("-Wl,--gc-sections"); + 
flags.emplace_back("-Wl,--orphan-handling=error"); flags.emplace_back("-Wl,-T," + ldscriptPath.string()); flags.emplace_back("-o"); flags.emplace_back(elfFile.string()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp index 3e683d50d..7ef24032d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp @@ -64,9 +64,14 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); WalkResult res = controlCodeOp->walk([&](Operation *op) { if (auto npuDmaOp = dyn_cast(op)) { - AMDAIE::CircularDmaCpyNdOp inputDma = npuDmaOp.getDmaCpyNdOp(); - if (npuDmaOp.getSourceMemorySpaceAsUInt() == 0) { - SmallVector tiles = inputDma.getSourceObjectFifo().getTiles(); + if (npuDmaOp.getSource()) { + auto logicalObjFifo = dyn_cast( + npuDmaOp.getSource().getDefiningOp()); + if (!logicalObjFifo) { + npuDmaOp.emitOpError() << "expected a source logical objectFifo"; + return WalkResult::interrupt(); + } + SmallVector tiles = logicalObjFifo.getTiles(); AMDAIE::TileOp tileOp; if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) return WalkResult::interrupt(); @@ -83,13 +88,22 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { tileOp, bdId.value()); rewriter.setInsertionPoint(npuDmaOp); npuDmaOp = rewriter.replaceOpWithNewOp( - npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTargetMixedOffsets(), - npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), - npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), - npuDmaOp.getSourceMixedStrides(), npuDmaOp.getTargetBdId(), bdIdOp); + npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTarget(), + npuDmaOp.getTargetMixedOffsets(), npuDmaOp.getTargetMixedSizes(), + 
npuDmaOp.getTargetMixedStrides(), npuDmaOp.getTargetBdId(), + npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(), + npuDmaOp.getSourceMixedSizes(), npuDmaOp.getSourceMixedStrides(), + bdIdOp); } - if (npuDmaOp.getTargetMemorySpaceAsUInt() == 0) { - SmallVector tiles = inputDma.getTargetObjectFifo().getTiles(); + if (npuDmaOp.getTarget()) { + auto logicalObjFifo = dyn_cast( + npuDmaOp.getTarget().getDefiningOp()); + if (!logicalObjFifo) { + npuDmaOp.emitOpError() + << "expected a target `amdaie.logicalobjectfifo.from_memref`"; + return WalkResult::interrupt(); + } + SmallVector tiles = logicalObjFifo.getTiles(); AMDAIE::TileOp tileOp; if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) return WalkResult::interrupt(); @@ -106,10 +120,11 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { tileOp, bdId.value()); rewriter.setInsertionPoint(npuDmaOp); (void)rewriter.replaceOpWithNewOp( - npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTargetMixedOffsets(), - npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), + npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTarget(), + npuDmaOp.getTargetMixedOffsets(), npuDmaOp.getTargetMixedSizes(), + npuDmaOp.getTargetMixedStrides(), bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), - npuDmaOp.getSourceMixedStrides(), bdIdOp, npuDmaOp.getSourceBdId()); + npuDmaOp.getSourceMixedStrides(), npuDmaOp.getSourceBdId()); } return WalkResult::advance(); } else if (auto npuWaitOp = dyn_cast(op)) { @@ -154,7 +169,7 @@ class AMDAIEAssignNpuDmaBdIdsPass } AMDAIEAssignNpuDmaBdIdsPass() = default; - AMDAIEAssignNpuDmaBdIdsPass(const AMDAIEAssignNpuDmaBdIdsPass &pass){}; + AMDAIEAssignNpuDmaBdIdsPass(const AMDAIEAssignNpuDmaBdIdsPass &pass) {}; void runOnOperation() override; }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp index 
c446df605..6ed1e4777 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp @@ -114,14 +114,28 @@ LogicalResult WorkgroupBuilder::buildForDmaCpyNdOp( SmallVector npuDmaSourceOffsets = dmaOp.getSourceMixedOffsets(); SmallVector npuDmaSourceSizes = dmaOp.getSourceMixedSizes(); SmallVector npuDmaSourceStrides = dmaOp.getSourceMixedStrides(); + Value circularDmaTarget, circularDmaSource, npuDmaTarget, npuDmaSource; if (!sourceMemSpace) { // Check if the source of DmaCpyNd op is from L3 - then source addressing // will be controlled by the uController and target addressing will stay in // the circular DMA to be part of the AIE configuration. + auto logicalObjFifo = dyn_cast( + dmaOp.getSource().getDefiningOp()); + if (!logicalObjFifo) { + return dmaOp.emitOpError() + << "`amdaie.logicalobjectfifo.from_memref` expected as source"; + } + auto type = cast(dmaOp.getSource().getType()); + auto placeholder = + rewriter.createAndLookup( + rewriter.getUnknownLoc(), type, logicalObjFifo.getTiles()); + circularDmaSource = placeholder.getResult(); + circularDmaTarget = dmaOp.getTarget(); circularDmaTargetOffsets = npuDmaTargetOffsets; circularDmaTargetSizes = npuDmaTargetSizes; circularDmaTargetStrides = npuDmaTargetStrides; + npuDmaSource = dmaOp.getSource(); npuDmaTargetOffsets = empty; npuDmaTargetSizes = empty; npuDmaTargetStrides = empty; @@ -129,26 +143,40 @@ LogicalResult WorkgroupBuilder::buildForDmaCpyNdOp( // Check if the target of DmaCpyNd op is from L3 - then target addressing // will be controlled by the uController and source addressing will stay in // the circular DMA to be part of the AIE configuration. 
+ auto logicalObjFifo = dyn_cast( + dmaOp.getTarget().getDefiningOp()); + if (!logicalObjFifo) { + return dmaOp.emitOpError() + << "`amdaie.logicalobjectfifo.from_memref` expected as target"; + } + auto type = cast(dmaOp.getTarget().getType()); + auto placeholder = + rewriter.createAndLookup( + rewriter.getUnknownLoc(), type, logicalObjFifo.getTiles()); + circularDmaSource = dmaOp.getSource(); + circularDmaTarget = placeholder.getResult(); circularDmaSourceOffsets = npuDmaSourceOffsets; circularDmaSourceSizes = npuDmaSourceSizes; circularDmaSourceStrides = npuDmaSourceStrides; + npuDmaTarget = dmaOp.getTarget(); npuDmaSourceOffsets = empty; npuDmaSourceSizes = empty; npuDmaSourceStrides = empty; } auto newDmaOp = rewriter.createAndMap( - rewriter.getUnknownLoc(), dmaOp, dmaOp.getTarget(), + rewriter.getUnknownLoc(), dmaOp, circularDmaTarget, circularDmaTargetOffsets, circularDmaTargetSizes, - circularDmaTargetStrides, dmaOp.getSource(), circularDmaSourceOffsets, + circularDmaTargetStrides, circularDmaSource, circularDmaSourceOffsets, circularDmaSourceSizes, circularDmaSourceStrides); IRRewriter::InsertPoint dmaInsertionPoint = rewriter.saveInsertionPoint(); controlCodeRewriter.setInsertionPoint(controlCode, controlCodeEnd); auto npuDmaCpy = controlCodeRewriter.createAndLookup( - loc, newDmaOp.getResult(), npuDmaTargetOffsets, npuDmaTargetSizes, - npuDmaTargetStrides, npuDmaSourceOffsets, npuDmaSourceSizes, - npuDmaSourceStrides, nullptr, nullptr); + loc, newDmaOp.getResult(), npuDmaTarget, npuDmaTargetOffsets, + npuDmaTargetSizes, npuDmaTargetStrides, /*target_bd_id=*/nullptr, + npuDmaSource, npuDmaSourceOffsets, npuDmaSourceSizes, npuDmaSourceStrides, + /*source_bd_id=*/nullptr); DMAChannelDir direction = !sourceMemSpace ?
DMAChannelDir::MM2S : DMAChannelDir::S2MM; controlCodeRewriter.createAndLookup( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp index f9225e612..cfd347313 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp @@ -88,15 +88,15 @@ LogicalResult createLogicalObjectFifoLink( auto sourceLogicalObjectFifo = dyn_cast( stridedOp.getSource().getDefiningOp()); - if (!sourceLogicalObjectFifo) { - stridedOp->emitError( - "does not have a `LogicalObjectFifoFromMemrefOp` as source"); - return failure(); - } if (!lastUserOp || lastUserOp->isBeforeInBlock(stridedOp)) { lastUserOp = stridedOp; } - if (logicalObjectFifo == sourceLogicalObjectFifo) { + // The `sourceLogicalObjectFifo` could be either a + // `LogicalObjectFifoFromMemrefOp` or `LogicalObjectFifoPlaceholderOp`, + // but currently the linking only works with + // `LogicalObjectFifoFromMemrefOp` on L2. + if (sourceLogicalObjectFifo && + logicalObjectFifo == sourceLogicalObjectFifo) { if (std::optional offset = stridedOp.getSourceStaticBaseOffset()) { outs.push_back(std::make_pair(stridedOp, offset.value())); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEHoistLogicalObjFifo.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEHoistLogicalObjFifo.cpp new file mode 100644 index 000000000..760dd2418 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEHoistLogicalObjFifo.cpp @@ -0,0 +1,62 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +#define DEBUG_TYPE "iree-amdaie-hoist-logical-objectfifo" + +namespace mlir::iree_compiler::AMDAIE { + +/// Hoist logical objectFifo operations until one of the operands is located +/// within the same scope. Returns success only if the op was actually moved. +LogicalResult hoistLogicalObjFifoOp(RewriterBase &rewriter, + AMDAIE::LogicalObjectFifoFromMemrefOp op) { + Operation *ancestorOp = op; + while (ancestorOp) { + Operation *newAncestorOp = ancestorOp->getParentOp(); + if (llvm::any_of(op->getOperands(), [&](Value operand) { + return operand.getDefiningOp() && + newAncestorOp->isProperAncestor(operand.getDefiningOp()); + })) { + break; + } + if (isa( + newAncestorOp)) { + break; + } + ancestorOp = newAncestorOp; + } + if (!ancestorOp || ancestorOp == op) return failure(); + rewriter.moveOpBefore(op, ancestorOp); + return success(); +} + +namespace { +struct AMDAIEHoistLogicalObjFifoPass + : public impl::AMDAIEHoistLogicalObjFifoBase< + AMDAIEHoistLogicalObjFifoPass> { + void runOnOperation() override; +}; + +void AMDAIEHoistLogicalObjFifoPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + + parentOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp op) { + (void)hoistLogicalObjFifoOp(rewriter, op); + }); +} + +} // namespace + +std::unique_ptr createAMDAIEHoistLogicalObjFifoPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 6fccc017d..284b297c9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -460,8 +460,13 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter, int &dmaId) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CircularDmaCpyNdOp]\n"); rewriter.setInsertionPointToEnd(deviceBlock); + if (!dmaOp.getSource()) return dmaOp.emitOpError() << "expected a source"; + auto sourceLogicalObjFifo = dyn_cast( + dmaOp.getSource().getDefiningOp()); + if (!sourceLogicalObjFifo) + return dmaOp.emitOpError() << "expected a logical objectFifo source"; SmallVector newSourceTiles = - llvm::map_to_vector(dmaOp.getSourceObjectFifo().getTiles(), + llvm::map_to_vector(sourceLogicalObjFifo.getTiles(), [&](Value tile) { return mapper.lookup(tile); }); if (newSourceTiles.size() != 1) { return dmaOp.emitError() @@ -469,8 +474,14 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter, "`ObjectFifoCreateOp` only handles a single source tile for now."; } Value newSourceTile = newSourceTiles[0]; + + if (!dmaOp.getTarget()) return dmaOp.emitOpError() << "expected a target"; + auto targetLogicalObjFifo = dyn_cast( + dmaOp.getTarget().getDefiningOp()); + if (!targetLogicalObjFifo) + return dmaOp.emitOpError() << "expected a logical objectFifo target"; SmallVector newTargetTiles = - llvm::map_to_vector(dmaOp.getTargetObjectFifo().getTiles(), + llvm::map_to_vector(targetLogicalObjFifo.getTiles(), [&](Value tile) { return mapper.lookup(tile); }); auto symName = "obj" + std::to_string(dmaId++); @@ -563,7 +574,13 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, IRMapping &mapper, IRMapping &bindingsMapper) { rewriter.setInsertionPoint(dmaOp); // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves.
- if (dmaOp.getSourceMemorySpaceAsUInt() == 0) { + if (dmaOp.getSource()) { + auto sourceLogicalObjFifo = dyn_cast( + dmaOp.getSource().getDefiningOp()); + if (!sourceLogicalObjFifo) { + return dmaOp.emitOpError() << "expected source to be an " + "`amdaie.logicalobjectfifo.from_memref`"; + } if (!dmaOp.hasSourceAddressing()) { return dmaOp.emitOpError() << "expected source addressing for DMA with source on L3"; @@ -590,8 +607,7 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, } AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp(); - Value memref = - bindingsMapper.lookup(dmaCpyNd.getSourceObjectFifo().getMemref()); + Value memref = bindingsMapper.lookup(sourceLogicalObjFifo.getMemref()); auto objFifo = dyn_cast( mapper.lookup(dmaCpyNd.getOperation())); if (!objFifo) { @@ -604,7 +620,13 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, empty, empty, staticOffsets, staticSizes, staticStrides, objFifo.getName(), bdIdOp.getValue(), issueToken); } - if (dmaOp.getTargetMemorySpaceAsUInt() == 0) { + if (dmaOp.getTarget()) { + auto targetLogicalObjFifo = dyn_cast( + dmaOp.getTarget().getDefiningOp()); + if (!targetLogicalObjFifo) { + return dmaOp.emitOpError() << "expected target to be an " + "`amdaie.logicalobjectfifo.from_memref`"; + } if (!dmaOp.hasTargetAddressing()) { return dmaOp.emitOpError() << "expected target addressing for DMA with target on L3"; @@ -631,8 +653,7 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, } AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp(); - Value memref = - bindingsMapper.lookup(dmaCpyNd.getTargetObjectFifo().getMemref()); + Value memref = bindingsMapper.lookup(targetLogicalObjFifo.getMemref()); auto objFifo = dyn_cast( mapper.lookup(dmaCpyNd.getOperation())); if (!objFifo) { @@ -786,7 +807,7 @@ LogicalResult tileToAIE(IRRewriter &rewriter, AMDAIE::TileOp tileOp, LogicalResult workgroupToAIE(IRRewriter &rewriter, AMDAIE::WorkgroupOp workgroupOp, xilinx::AIE::DeviceOp deviceOp, - 
xilinx::AIEX::RuntimeSequenceOp ipuFuncOp, + xilinx::AIEX::RuntimeSequenceOp npuFuncOp, IRMapping &mapper, IRMapping &bindingsMapper) { OpBuilder::InsertionGuard guard(rewriter); Block *deviceBlock = &deviceOp.getRegion().front(); @@ -810,7 +831,7 @@ LogicalResult workgroupToAIE(IRRewriter &rewriter, return WalkResult::advance(); }) .Case([&](auto controlCodeOp) { - if (failed(controlCodeToAie(rewriter, controlCodeOp, ipuFuncOp, + if (failed(controlCodeToAie(rewriter, controlCodeOp, npuFuncOp, mapper, bindingsMapper))) { controlCodeOp.emitError("could not convert to AIEDialect ops"); return WalkResult::interrupt(); @@ -897,13 +918,13 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { return a.getBinding().getZExtValue() < b.getBinding().getZExtValue(); }); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); - auto ipuFuncOp = rewriter.create( + auto npuFuncOp = rewriter.create( rewriter.getUnknownLoc(), rewriter.getStringAttr(funcOp.getSymName())); - ipuFuncOp.getBody().push_back(new Block); + npuFuncOp.getBody().push_back(new Block); for (int i = 0, e = subspanOps.size(); i < e; i++) { auto a = subspanOps[i].getResult(); - ipuFuncOp.getBody().addArgument(a.getType(), a.getLoc()); - bindingsMapper.map(a, ipuFuncOp.getBody().getArgument(i)); + npuFuncOp.getBody().addArgument(a.getType(), a.getLoc()); + bindingsMapper.map(a, npuFuncOp.getBody().getArgument(i)); } // Walk the AIE regions ops and convert ops into pure AIEDialect ops. 
@@ -913,7 +934,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { if (isa(op)) { return WalkResult::advance(); } else if (auto workgroupOp = dyn_cast(op)) { - if (failed(workgroupToAIE(rewriter, workgroupOp, deviceOp, ipuFuncOp, + if (failed(workgroupToAIE(rewriter, workgroupOp, deviceOp, npuFuncOp, mapper, bindingsMapper))) { return WalkResult::interrupt(); } @@ -928,7 +949,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { if (res.wasInterrupted()) return WalkResult::interrupt(); // Move NPU instruction function to the end of the device block. - rewriter.moveOpBefore(ipuFuncOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end()); // After walking the FuncOp, it has been converted into a DeviceOp and can // safely be erased. eraseOp(rewriter, mapper, funcOp); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 171525b89..938171a48 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -67,6 +67,7 @@ iree_cc_library( "AMDAIEFuseFillIntoForall.cpp" "AMDAIEFusePackIntoLoop.cpp" "AMDAIEHoistForAffineApply.cpp" + "AMDAIEHoistLogicalObjFifo.cpp" "AMDAIEInsertCores.cpp" "AMDAIEInsertLoopsForVectorization.cpp" "AMDAIELinkExecutables.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 54ede7e21..5bd44c2e7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -45,6 +45,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL #define GEN_PASS_DEF_AMDAIEFUSEPACKINTOLOOP #define GEN_PASS_DEF_AMDAIEHOISTFORLOOPAFFINEAPPLY +#define 
GEN_PASS_DEF_AMDAIEHOISTLOGICALOBJFIFO #define GEN_PASS_DEF_AMDAIEINSERTAIEWORKGROUP #define GEN_PASS_DEF_AMDAIEINSERTCORES #define GEN_PASS_DEF_AMDAIEINSERTLOOPSFORVECTORIZATION diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index a1bd27d91..81c54b413 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -587,6 +587,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addNestedPass(createAMDAIECreateAIEWorkgroupPass()); passManager.addPass(createCSEPass()); + passManager.addPass(createAMDAIEHoistLogicalObjFifoPass()); passManager.addPass(createAMDAIECanonicalizeDoublyStridedOpPass()); passManager.addPass(createAMDAIEFlattenLogicalObjectFifoPass()); passManager.addPass(createAMDAIEAssignLogicalObjectFifoDepthPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 2975fb035..e01890f90 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -140,6 +140,10 @@ std::unique_ptr createAMDAIEFuseFillIntoForallPass(); /// Hoist an affine.apply op on a scf.for op's induction variable. std::unique_ptr createAMDAIEHoistForLoopAffineApplyPass(); +/// Create a pass to hoist logical objectFifo operations to the scope of its +/// operands. +std::unique_ptr createAMDAIEHoistLogicalObjFifoPass(); + /// Create a pass to transform linalg.generics into a form which benefits later /// vectorization passes (to vector and aievec dialects). 
std::unique_ptr createAMDAIEInsertLoopsForVectorizationPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index ee06e34d6..2dec5f951 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -213,6 +213,12 @@ def AMDAIEHoistForLoopAffineApply : Pass<"iree-amdaie-hoist-for-affine-apply"> { let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEHoistForLoopAffineApplyPass()"; } +def AMDAIEHoistLogicalObjFifo : Pass<"iree-amdaie-hoist-logical-objectfifo"> { + let summary = "Hoist logical objectFifo operations to the scope of the most nested of its " + "operands, without hoisting through workgroup, controlcode, or func ops."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEHoistLogicalObjFifoPass()"; +} + def AMDAIEInsertCores : Pass<"iree-amdaie-insert-cores", "ModuleOp"> { let summary = "Insert `amdaie.core` operations inside the innermost " diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index abba1ca33..cf96a5383 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -34,6 +34,7 @@ iree_lit_test_suite( "fuse_fill_into_forall.mlir" "fuse_pack_into_loop.mlir" "hoist_for_affine_apply.mlir" + "hoist_logical_obj_fifo.mlir" "insert_cores.mlir" "insert_loops_for_vectorization.mlir" "localize_logical_objectfifo.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir index 8cb4ba4e7..8d0eba8fe 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir @@ -14,11 +14,12 @@ module { // CHECK-LABEL: @single_dma_cpy_nd_on_source // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -29,11 +30,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 
= amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, MM2S) amdaie.end } @@ -47,11 +49,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @single_dma_cpy_nd_on_target // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]](%[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -62,11 +65,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> 
!amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_0[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) + %1 = amdaie.npu.dma_cpy_nd %0(%from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, S2MM) amdaie.end } @@ -82,19 +86,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[C1:.+]] = arith.constant 1 : index // CHECK: %[[C2:.+]] = arith.constant 2 : index // CHECK: amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) -// CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0) -// CHECK: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) -// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0) +// CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: 
%[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) @@ -110,17 +117,20 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_1_0 = amdaie.tile(%c1, %c0) %tile_2_0 = amdaie.tile(%c2, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> 
-> !amdaie.logicalobjectfifo> %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) - %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0] [8, 16] [16, 1]) - %2 = amdaie.npu.dma_cpy_nd %dma2([] [] [], [0] [128] [1]) + %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd %dma2([] [] [], %from_memref_2[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%0, MM2S) amdaie.npu.dma_wait(%1, MM2S) amdaie.npu.dma_wait(%2, MM2S) @@ -136,15 +146,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @multiple_dma_cpy_with_bd_id_reuse // 
CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -155,15 +166,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = 
amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, MM2S) - %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [8, 16] [16, 1]) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%2, MM2S) - %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0] [128] [1]) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%3, MM2S) amdaie.end } @@ -181,11 +193,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) // CHECK-DAG: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1) // CHECK-DAG: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], 2) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) -// 
CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) @@ -198,13 +211,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) - %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [8, 16] [16, 1]) - %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0] [128] [1]) + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> + %3 = 
amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, MM2S) amdaie.npu.dma_wait(%2, MM2S) amdaie.npu.dma_wait(%3, MM2S) @@ -231,19 +245,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 1) // CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) // CHECK-DAG: %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) // CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) -// CHECK: 
%[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_0_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) +// CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0_1]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_3]], MM2S) -// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], [] [] [] bd_id = %[[BD_ID_2_0]]) +// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_4]], MM2S) // CHECK: } // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) @@ -262,23 +279,26 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_1_0 = amdaie.tile(%c1, %c0) %tile_2_0 = amdaie.tile(%c2, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = 
amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) + %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], %from_memref_0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> scf.forall (%arg4, %arg5) in (2, 2) { - %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> scf.for %arg6 = %c0 to %c6 step %c1 { - %2 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0] [1, 128] [128, 1]) - %3 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0] [128] [1]) + %2 = amdaie.npu.dma_cpy_nd %dma1([] [] [], %from_memref_1[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo> + %3 = amdaie.npu.dma_cpy_nd %dma0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%2, MM2S) amdaie.npu.dma_wait(%3, MM2S) - %4 = amdaie.npu.dma_cpy_nd %dma2([] [] [], [] [] []) + %4 = amdaie.npu.dma_cpy_nd %dma2([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%4, MM2S) } amdaie.npu.dma_wait(%1, MM2S) diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir index 94c882d51..5d5ff3997 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir @@ -60,19 +60,19 @@ func.func @core() { // ----- // CHECK-LABEL: @dma_cpy_nd_L3_L2 +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32, 1>, %[[ARG1:.+]]: memref<8x16xi32> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]], {} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @dma_cpy_nd_L3_L2(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> @@ -84,19 +84,19 @@ func.func @dma_cpy_nd_L3_L2(%arg0: 
memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi // ----- // CHECK-LABEL: @dma_cpy_nd_L3_L2_target_addressing +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32, 1>, %[[ARG1:.+]]: memref<8x16xi32> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]], {} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @dma_cpy_nd_L3_L2_target_addressing(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> @@ -108,19 +108,17 @@ func.func @dma_cpy_nd_L3_L2_target_addressing(%arg0: memref<1x1x8x16xi32, 1>, %a // ----- // CHECK-LABEL: @dma_cpy_nd_L2_L3 +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> 
!amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] +// CHECK-SAME: %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]], {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]](%[[FROMMEMREF1]][] [] [], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) func.func @dma_cpy_nd_L2_L3(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> @@ -132,18 +130,18 @@ func.func @dma_cpy_nd_L2_L3(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, // ----- // CHECK-LABEL: @dma_cpy_nd_L2_L3_target_addressing +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// 
CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] +// CHECK-SAME: %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]], {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK-SAME: [] [] [] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) func.func @dma_cpy_nd_L2_L3_target_addressing(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { @@ -226,26 +224,26 @@ func.func @for_cores() { // Verify that scf.for is inserted in control code with nested dmas. 
// // CHECK-LABEL: @for_dma +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1_1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8_1:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // CHECK: scf.for %[[ARG:.+]] = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, %[[ARG]], 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, %[[ARG]], 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @for_dma(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { %c0 = arith.constant 0 : index @@ -308,20 +306,20 @@ func.func @forall_cores() { // Verify that scf.forall is inserted in control code with nested dmas. 
// // CHECK-LABEL: @forall_dmas +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (2, 2) // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, %[[ARG1]], %[[ARG0]], 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, %[[ARG1]], %[[ARG0]], 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @forall_dmas(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> @@ -337,57 +335,62 @@ func.func @forall_dmas(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) // Verify that cores on the same location, but within different scope merge correctly. 
// // CHECK-LABEL: @merge_cores +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 2> // CHECK: amdaie.workgroup // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index // CHECK-DAG: %[[TILE_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] +// CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF0]][] [] [], %[[PLACEHOLDER]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[PLACEHOLDER2:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK: %[[DMA2:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF0]][] [] [], %[[PLACEHOLDER2]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %{{.+}} = amdaie.core(%[[TILE_0]], in : [%[[DMA]], %[[DMA2]]], out : []) // CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) +// CHECK: 
%{{.+}} = amdaie.core(%[[TILE_1]], in : [%[[DMA]], %[[DMA2]]], out : []) // CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: amdaie.controlcode // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1_1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8_1:.+]] = arith.constant 8 : index -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) -// CHECK: scf.for %{{.*}} = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] -func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]]([] [] [], %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) +// CHECK: scf.for %{{.*}} = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] { +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[DMA2]]([] [] [], %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +// CHECK: } +func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 2>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %2 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, 
Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core_0_0_0 = amdaie.core(%tile_0_0, in : [%2], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } - %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + %core_0_1_0 = amdaie.core(%tile_0_1, in : [%2], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } scf.for %arg2 = %c0 to %c8 step %c1 { - %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) { + %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core_0_0_1 = amdaie.core(%tile_0_0, in : [%3], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } - %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) { + %core_0_1_1 = amdaie.core(%tile_0_1, in : [%3], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } } @@ -397,64 +400,47 @@ func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) // ----- // CHECK-LABEL: @complex_example +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 2>, %[[ARG2:.+]]: memref<1x1x16x16xi32>, %[[ARG3:.+]]: memref<16x16xi32, 2>, %[[ARG4:.+]]: memref<1x1x32x16xi32>, %[[ARG5:.+]]: memref<32x16xi32, 2> // CHECK: amdaie.workgroup // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = 
arith.constant 8 : index // CHECK-DAG: %[[TILE_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF2:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x16x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF3:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<16x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF4:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x32x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF5:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<32x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %[[DMA1:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF2]][] [] [] -// CHECK-SAME: %[[FROMMEMREF3]][0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %[[DMA2:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF4]][] [] [] -// CHECK-SAME: %[[FROMMEMREF5]][0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] +// CHECK-DAG: %[[FROMMEMREF3:.+]] = 
amdaie.logicalobjectfifo.from_memref %[[ARG3]] +// CHECK-DAG: %[[FROMMEMREF5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG5]] +// CHECK-DAG: %[[PLACEHOLDER0:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF1]][] [] [], %[[PLACEHOLDER0]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[PLACEHOLDER1:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA1:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF3]][] [] [], %[[PLACEHOLDER1]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[PLACEHOLDER2:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA2:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF5]][] [] [], %[[PLACEHOLDER2]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF1]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF2]], Read) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF3]], Read) // CHECK: linalg.fill -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) +// CHECK: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF1]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF4]], Read) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF5]], Read) // CHECK: linalg.fill // CHECK: amdaie.controlcode +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// CHECK-DAG: %[[FROMMEMREF2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG2]] +// CHECK-DAG: 
%[[FROMMEMREF4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG4]] // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1_1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8_1:.+]] = arith.constant 8 : index -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[DMA0]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[DMA0]]([] [] [], %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) // CHECK: scf.for %{{.*}} = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[DMA1]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[DMA2]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], S2MM) -func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>, %arg2: memref<1x1x16x16xi32>, %arg3: memref<16x16xi32, 1>, %arg4: memref<1x1x32x16xi32>, %arg5: memref<32x16xi32, 1>) { +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[DMA1]]([] [] [], %[[FROMMEMREF2]][0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[DMA2]]([] [] [], %[[FROMMEMREF4]][0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 2>, %arg2: memref<1x1x16x16xi32>, %arg3: memref<16x16xi32, 2>, %arg4: memref<1x1x32x16xi32>, %arg5: memref<32x16xi32, 2>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index @@ -462,31 +448,31 @@ func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, %tile_0_0 = amdaie.tile(%c0, %c0) 
%tile_0_1 = amdaie.tile(%c0, %c1) %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 2> -> !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_memref %arg2, {} : memref<1x1x16x16xi32> -> !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_memref %arg3, {} : memref<16x16xi32, 1> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %arg3, {} : memref<16x16xi32, 2> -> !amdaie.logicalobjectfifo> %4 = amdaie.logicalobjectfifo.from_memref %arg4, {} : memref<1x1x32x16xi32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.logicalobjectfifo.from_memref %arg5, {} : memref<32x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma_0 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %arg5, {} : memref<32x16xi32, 2> -> !amdaie.logicalobjectfifo> + %dma_0 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } scf.for %iv0 = %c0 to %c8 step %c1 { - %dma_1 = amdaie.dma_cpy_nd(%2[] [] [], %3[0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - 
%dma_2 = amdaie.dma_cpy_nd(%4[] [] [], %5[0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_1 = amdaie.dma_cpy_nd(%3[] [] [], %2[0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_2 = amdaie.dma_cpy_nd(%5[] [] [], %4[0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) { - amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x16x16xi32> - linalg.fill ins(%c0_i32 : i32) outs(%arg2 : memref<1x1x16x16xi32>) + amdaie.logicalobjectfifo.access(%3, Read) : !amdaie.logicalobjectfifo> -> memref<16x16xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%arg3 : memref<16x16xi32, 2>) amdaie.end } %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) { - amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x32x16xi32> - linalg.fill ins(%c0_i32 : i32) outs(%arg4 : memref<1x1x32x16xi32>) + amdaie.logicalobjectfifo.access(%5, Read) : !amdaie.logicalobjectfifo> -> memref<32x16xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%arg5 : memref<32x16xi32, 2>) amdaie.end } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/hoist_logical_obj_fifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/hoist_logical_obj_fifo.mlir new file mode 100644 index 000000000..518213533 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/hoist_logical_obj_fifo.mlir @@ -0,0 +1,205 @@ +// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-hoist-logical-objectfifo)" %s | FileCheck %s + +// CHECK-LABEL: @func_hoist +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], 
{%[[TILE_0_0]]} +// CHECK: scf.forall +// CHECK-NOT: amdaie.logicalobjectfifo.from_memref +module { + func.func @func_hoist(%arg0: memref<32x64xi32>) { + %c0 = arith.constant 0 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + scf.forall (%arg1, %arg2) in (1, 2) { + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + } + return + } +} + + +// ----- + +// CHECK-LABEL: @func_no_hoist +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: scf.forall +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +module { + func.func @func_no_hoist(%arg0: memref<32x64xi32>) { + %c0 = arith.constant 0 : index + scf.forall (%arg1, %arg2) in (1, 2) { + %tile_0_0 = amdaie.tile(%c0, %c0) + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + } + return + } +} + +// ----- + +// CHECK-LABEL: @workgroup_hoist +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: amdaie.workgroup +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +// CHECK: scf.forall +// CHECK-NOT: amdaie.logicalobjectfifo.from_memref +func.func @workgroup_hoist(%arg0: memref<32x64xi32>) { + amdaie.workgroup { + %c0 = arith.constant 0 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + scf.forall (%arg1, %arg2) in (1, 2) { + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + } + amdaie.controlcode { + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @workgroup_no_hoist +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: amdaie.workgroup +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: scf.forall +// CHECK: %[[TILE_0_0:.+]] = 
amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +func.func @workgroup_no_hoist(%arg0: memref<32x64xi32>) { + amdaie.workgroup { + %c0 = arith.constant 0 : index + scf.forall (%arg1, %arg2) in (1, 2) { + %tile_0_0 = amdaie.tile(%c0, %c0) + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + } + amdaie.controlcode { + amdaie.end + } + } + return +} +// ----- + +// CHECK-LABEL: @workgroup_no_hoist_outside +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.workgroup +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +func.func @workgroup_no_hoist_outside(%arg0: memref<32x64xi32>) { + %c0 = arith.constant 0 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + amdaie.workgroup { + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + amdaie.controlcode { + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @controlcode_hoist +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: amdaie.controlcode +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +// CHECK: scf.forall +// CHECK: scf.forall +// CHECK-NOT: amdaie.logicalobjectfifo.from_memref +func.func @controlcode_hoist(%arg0: memref<32x64xi32>) { + amdaie.workgroup { + amdaie.controlcode { + %c0 = arith.constant 0 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + scf.forall (%arg1, %arg2) in (1, 2) { + scf.forall (%arg3, %arg4) in (1, 2) { + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + } + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @controlcode_partial_hoist +// 
CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: amdaie.controlcode +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: scf.forall +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +// CHECK: scf.forall +// CHECK-NOT: amdaie.logicalobjectfifo.from_memref +func.func @controlcode_partial_hoist(%arg0: memref<32x64xi32>) { + amdaie.workgroup { + amdaie.controlcode { + %c0 = arith.constant 0 : index + scf.forall (%arg1, %arg2) in (1, 2) { + %tile_0_0 = amdaie.tile(%c0, %c0) + scf.forall (%arg3, %arg4) in (1, 2) { + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + } + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @controlcode_no_hoist +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: amdaie.controlcode +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: scf.forall +// CHECK: scf.forall +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +func.func @controlcode_no_hoist(%arg0: memref<32x64xi32>) { + amdaie.workgroup { + amdaie.controlcode { + %c0 = arith.constant 0 : index + scf.forall (%arg1, %arg2) in (1, 2) { + scf.forall (%arg3, %arg4) in (1, 2) { + %tile_0_0 = amdaie.tile(%c0, %c0) + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + } + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @controlcode_no_hoist_outside +// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: amdaie.controlcode +// CHECK: amdaie.logicalobjectfifo.from_memref %[[ARG0]], {%[[TILE_0_0]]} +func.func @controlcode_no_hoist_outside(%arg0: memref<32x64xi32>) { + amdaie.workgroup { + %c0 = arith.constant 0 : index + %tile_0_0 = 
amdaie.tile(%c0, %c0) + amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + amdaie.end + } + } + return +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 6766f04a4..9f5b2f6aa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -420,18 +420,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_2 = amdaie.tile(%c0, %c2) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> 
!amdaie.logicalobjectfifo> // expected-error @+1 {{op expected to have a target BD ID op}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_0, S2MM) amdaie.end } @@ -464,18 +465,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{op expected target addressing for DMA with target on L3}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [] bd_id = %bd_id_0, [] [] 
[]) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_0, S2MM) amdaie.end } @@ -506,18 +508,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{could not canonicalize for AIE}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 0, 32] [1, 32, 2, 32] [0, 64, 0, 1] bd_id = %bd_id_0, [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [1, 32, 2, 32] [0, 64, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } 
@@ -547,18 +550,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{could not canonicalize for AIE}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -591,16 +595,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> 
%alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 32] [1, 2, 32] [0, 0, 1] bd_id = %bd_id_0, [] [] []) + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 32] [1, 2, 32] [0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -633,16 +638,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : 
memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0, [] [] []) + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -707,24 +713,25 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - 
%dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_source_l3 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_source_l3 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0, [] [] []) + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_0, S2MM) - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0] [%c1024] [%c1] bd_id = %bd_id_0, [] [] []) + %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0] [%c1024] [%c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_1, S2MM) - %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) + %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> 
amdaie.npu.dma_wait(%npu_dma_2, MM2S) - %npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [%c0] [%c2048] [%c1] bd_id = %bd_id_0) + %npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0] [%c2048] [%c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_3, MM2S) amdaie.end } @@ -736,12 +743,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK: aie.device(npu1_4col) { -// CHECK: %[[TILE_0_0:.*]] = aie.tile(0, 0) -// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK-DAG: %[[TILE_0_0:.*]] = aie.tile(0, 0) +// CHECK-DAG: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK-DAG: %[[TILE_1_0:.*]] = aie.tile(1, 0) // CHECK: aie.objectfifo @[[OBJ0:.*]](%[[TILE_0_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> -// CHECK: aie.objectfifo @[[OBJ1:.*]](%[[TILE_0_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> -// CHECK: aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]] -// CHECK-SAME: %[[TILE_0_0]]}, 2 : i32) : !aie.objectfifo> +// CHECK: aie.objectfifo @[[OBJ1:.*]](%[[TILE_1_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> +// CHECK: aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]] +// CHECK-SAME: {%[[TILE_1_0]]}, 2 : i32) : !aie.objectfifo> // CHECK: aiex.runtime_sequence @bf16_f32_lit_test // CHECK-SAME: (%[[LHS:.*]]: memref<32x32xbf16>, %[[RHS:.*]]: memref<32x32xbf16>, %[[OUT:.*]]: memref<32x32xf32>) { // CHECK: aiex.npu.dma_memcpy_nd @@ -784,23 +792,27 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x16xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16> %tile_1 = amdaie.tile(%c0, %c0) + %tile_2 = amdaie.tile(%c1, %c0) %bd_id = amdaie.bd_id(%tile_1, 2) %bd_id_2 = amdaie.bd_id(%tile_1, 1) %bd_id_3 = amdaie.bd_id(%tile_1, 0) - %4 = 
amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo> memref.assume_alignment %3, 64 : memref<32x32xbf16> %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16> - %6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> memref.assume_alignment %5, 64 : memref<32x32xbf16> %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x32xf32> - %8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo> - %9 = amdaie.circular_dma_cpy_nd(%2[] [] [], %4[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %6[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %11 = amdaie.circular_dma_cpy_nd(%8[] [] [], %0[%c0, %c0, %c0, %c0] [%c2, %c16, %c2, %c16] [%c512, %c16, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> + %9 = amdaie.circular_dma_cpy_nd(%2[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %11 = amdaie.circular_dma_cpy_nd(%placeholder2[] [] [], %0[%c0, %c0, %c0, %c0] [%c2, %c16, %c2, %c16] [%c512, %c16, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { - %12 = amdaie.npu.dma_cpy_nd %11([%c0] [%c1024] [%c1] bd_id = %bd_id_3, [] [] []) - %13 = amdaie.npu.dma_cpy_nd %10([] [] [], [%c0, 
%c1, %c2] [%c2, %c32, %c16] [%c16, %c32, %c1] bd_id = %bd_id_2) - %14 = amdaie.npu.dma_cpy_nd %9([] [] [], [%c0] [%c1024] [%c1] bd_id = %bd_id) + %obj0 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %obj1 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %obj2 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo> + %12 = amdaie.npu.dma_cpy_nd %11(%obj2[%c0] [%c1024] [%c1] bd_id = %bd_id_3, [] [] []) : target_type = !amdaie.logicalobjectfifo> + %13 = amdaie.npu.dma_cpy_nd %10([] [] [], %obj1[%c0, %c1, %c2] [%c2, %c32, %c16] [%c16, %c32, %c1] bd_id = %bd_id_2) : source_type = !amdaie.logicalobjectfifo> + %14 = amdaie.npu.dma_cpy_nd %9([] [] [], %obj0[%c0] [%c1024] [%c1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%12, S2MM) amdaie.npu.dma_wait(%13, MM2S) amdaie.npu.dma_wait(%14, MM2S) @@ -837,18 +849,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x16x64x128x32xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = 
amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x16x64x128x32xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{op expected target addressing for DMA with target on L3}} - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [] bd_id = %bd_id_0, [] [] []) + %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_1, S2MM) amdaie.end } @@ -931,10 +944,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.assume_alignment %0, 64 : memref<32x64xi32> %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma1] () 
%core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) { @@ -960,7 +973,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) + %obj0 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma, MM2S) amdaie.end } diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index b5a1f13b8..013bc863d 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt 
--pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) diff --git a/tests/transform_dialect/conv_fill_spec_pad.mlir b/tests/transform_dialect/conv_fill_spec_pad.mlir index d1f337ff0..484d71ab8 100644 --- a/tests/transform_dialect/conv_fill_spec_pad.mlir +++ b/tests/transform_dialect/conv_fill_spec_pad.mlir @@ -54,7 +54,7 @@ module attributes { transform.with_named_sequence } { transform.yield } - transform.named_sequence @full_pipeline(%variant_op: !any {transform.consumed}) { + transform.named_sequence @full_pipeline(%variant_op: !any {transform.readonly}) { %ops = transform.structured.match ops{["linalg.fill", "linalg.conv_2d_nchw_fchw"]} in %variant_op : (!any) -> !any %fill, %conv = transform.split_handle %ops : (!any) -> (!any, !any)