diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp index 52871fb88..0fd4932ba 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp @@ -11,7 +11,6 @@ #include "Passes.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/STLExtras.h" -#include "mlir/Analysis/TopologicalSortUtils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -113,28 +112,11 @@ bool isJoin(ObjectFifoLinkOp op) { return op.getFifoIns().size() > 1; } bool isDistribute(ObjectFifoLinkOp op) { return op.getFifoOuts().size() > 1; } std::optional getOptionalSharedTile(ObjectFifoLinkOp op) { - if (isJoin(op)) { - auto fifoOut = getOutputObjectFifos(op)[0]; - for (auto fifoIn : getInputObjectFifos(op)) - if (fifoOut.getProducerTile() != fifoIn.getConsumerTiles()[0]) return {}; - return {fifoOut.getProducerTile()}; - } - - if (isDistribute(op)) { - auto fifoIn = getInputObjectFifos(op)[0]; - for (auto fifoOut : getOutputObjectFifos(op)) - if (fifoIn.getConsumerTiles()[0] != fifoOut.getProducerTile()) return {}; - return {fifoIn.getConsumerTiles()[0]}; - } - - auto fifoIn = getInputObjectFifos(op); - if (auto fifoOut = getOutputObjectFifos(op); - !fifoIn.empty() && !fifoOut.empty()) - for (auto consumerIn : fifoIn[0].getConsumerTiles()) - if (consumerIn == fifoOut[0].getProducerTile()) - return {fifoOut[0].getProducerTile()}; - return {}; + std::vector fifoOuts = getOutputObjectFifos(op); + assert(fifoOuts.size() > 0); + return fifoOuts[0].getProducerTile(); } + } // namespace class LockAnalysis { @@ -168,21 +150,7 @@ class DMAChannelAnalysis { DenseMap consumerChannelsPerTile; public: - DMAChannelAnalysis(DeviceOp &device) { - // go over the channels used for each tile and update the producer/consumer - // channel maps - for (auto memOp : device.getOps()) { - Region &r = memOp.getBody(); - auto tile = memOp.getTile(); - for (auto &bl : r.getBlocks()) { - for (auto op : bl.getOps()) { - static_cast(op.getChannelDir()) == DMAChannelDir::MM2S - ? getProducerDMAChannel(tile) - : getConsumerDMAChannel(tile); - } - } - } - } + DMAChannelAnalysis() {} /// Given an AIE tile, returns its next usable producer channel. SwitchDMAConnection getProducerDMAChannel(Value tile) { @@ -536,12 +504,6 @@ void replaceReleaseOp( DenseMap, std::vector> &releaseOps) { ObjectFifoCreateOp op = getObjectFifo(releaseOp); - auto core = releaseOp->getParentOfType(); - if (auto linkOp = getOptionalLinkOp(op)) - if (core.getTile() == *getOptionalSharedTile(*linkOp)) - llvm::report_fatal_error( - "currently cannot access objectFifo used in " - "ObjectFifoLinkOp"); auto port = releaseOp.getPort(); std::pair opPort = {op, static_cast(port)}; @@ -653,12 +615,7 @@ void replaceObjectAcquireOp( const DenseMap> &buffersPerFifo, DenseMap> &subviews) { ObjectFifoCreateOp op = getObjectFifo(acquireOp); - auto core = acquireOp->getParentOfType(); auto linkOp = getOptionalLinkOp(op); - if (linkOp && core.getTile() == *getOptionalSharedTile(*linkOp)) - llvm::report_fatal_error( - "currently cannot access objectFifo used in " - "ObjectFifoLinkOp"); // index of next element to acquire for this objectFifo // useful for keeping track of which @@ -995,7 +952,7 @@ struct AMDAIEObjectFifoStatefulTransformPass : mlir::OperationPass { void runOnOperation() override { DeviceOp device = getOperation(); LockAnalysis lockAnalysis(device); - DMAChannelAnalysis dmaAnalysis(device); + DMAChannelAnalysis dmaAnalysis; OpBuilder builder = OpBuilder::atBlockEnd(device.getBody()); // maps each objFifo to its corresponding buffer DenseMap> buffersPerFifo; @@ -1092,16 +1049,14 @@ struct AMDAIEObjectFifoStatefulTransformPass : mlir::OperationPass { } // Remove old ops - SetVector opsToErase; + IRRewriter rewriter(&getContext()); device.walk([&](Operation *op) { if (isa(op)) - opsToErase.insert(op); + ObjectFifoSubviewAccessOp, ObjectFifoReleaseOp>(op)) { + op->dropAllUses(); + rewriter.eraseOp(op); + } }); - topologicalSort(opsToErase); - IRRewriter rewriter(&getContext()); - for (auto it = opsToErase.rbegin(); it != opsToErase.rend(); ++it) - (*it)->erase(); } }; diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir index 28ba3ef42..597a8c409 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir @@ -1,7 +1,7 @@ // RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s -// CHECK-LABEL: aie.device(npu1_4col) { +// CHECK-LABEL: aie.device(xcvc1902) { // CHECK: memref.global "public" @of2_cons : memref<16xi32> // CHECK: memref.global "public" @of2 : memref<16xi32> // CHECK: memref.global "public" @of1_cons : memref<16xi32> @@ -68,7 +68,7 @@ // CHECK: } module @link_AIE1 { - aie.device(npu1_4col) { + aie.device(xcvc1902) { %tile20 = aie.tile(2, 0) %tile12 = aie.tile(1, 2) %tile22 = aie.tile(2, 2) diff --git a/compiler/plugins/target/AMD-AIE/aie/test/tileDMA_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/tileDMA_test.mlir deleted file mode 100644 index 3818d0eea..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/tileDMA_test.mlir +++ /dev/null @@ -1,145 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @objfifo_cons : memref<16xi32> -// CHECK: memref.global "public" @objfifo : memref<16xi32> -// CHECK: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK: %[[OBJFIFO_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0"} : memref<16xi32> -// CHECK: %[[OBJFIFO_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_1"} : memref<16xi32> -// CHECK: %[[OBJFIFO_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]], 0) {init = 2 : i8, sym_name = "objfifo_cons_prod_lock"} -// CHECK: %[[OBJFIFO_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]], 1) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock"} -// CHECK: %[[OBJFIFO_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_buff_0"} : memref<16xi32> -// CHECK: %[[OBJFIFO_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_buff_1"} : memref<16xi32> -// CHECK: %[[OBJFIFO_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]], 3) {init = 2 : i8, sym_name = "objfifo_prod_lock"} -// CHECK: %[[OBJFIFO_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]], 4) {init = 0 : i8, sym_name = "objfifo_cons_lock"} -// CHECK: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) : memref<16xi32> -// CHECK: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]], 0) -// CHECK: %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) : memref<16xi32> -// CHECK: %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]], 1) -// CHECK: %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) : memref<16xi32> -// CHECK: %[[LOCK_1_2_3:.*]] = aie.lock(%[[TILE_1_2]], 2) -// CHECK: aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_3_3]], DMA : 0) -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_BUFF_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_BUFF_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2]], Acquire, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 0) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], Acquire, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], Release, 0) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb4, ^bb5) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], Acquire, 0) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb5: -// CHECK: %[[VAL_2:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @tileDMA_channels { - aie.device(npu1_4col) { - %tile12 = aie.tile(1, 2) - %tile33 = aie.tile(3, 3) - %buff0 = aie.buffer(%tile12) : memref<16xi32> - %lock0 = aie.lock(%tile12, 0) - %buff1 = aie.buffer(%tile12) : memref<16xi32> - %lock1 = aie.lock(%tile12, 1) - %buff2 = aie.buffer(%tile12) : memref<16xi32> - %lock2 = aie.lock(%tile12, 2) - aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo> - func.func @some_work(%lineOut : memref<16xi32>) -> () { - return - } - %core12 = aie.core(%tile12) { - %c0 = arith.constant 0 : index - %c2 = arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - } - aie.end - } - %mem12 = aie.mem(%tile12) { - %dma1 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: - aie.use_lock(%lock0, Acquire, 1) - aie.dma_bd(%buff0 : memref<16xi32>) {len = 16 : i32} - aie.use_lock(%lock0, Release, 0) - aie.next_bd ^bb2 - ^bb2: - aie.use_lock(%lock1, Acquire, 1) - aie.dma_bd(%buff1 : memref<16xi32>) {len = 16 : i32} - aie.use_lock(%lock1, Release, 0) - aie.next_bd ^bb1 - ^bb3: - %dma2 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: - aie.use_lock(%lock2, Acquire, 0) - aie.dma_bd(%buff2 : memref<16xi32>) {len = 16 : i32} - aie.use_lock(%lock2, Release, 1) - aie.next_bd ^bb4 - ^bb5: - aie.end - } - } -}