From f9526c92f7201a08a427a4c2112c918718e99e1a Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Wed, 18 Sep 2024 15:59:44 +0200 Subject: [PATCH] Delete StatefulTransform and move logic into AMDAIE passes (#784) This PR removes `AMDAIEStatefulTransform` and `AMDAIEAssignLockIDs` by moving logic into `AMDAIEBufferization`, `AMDAIEAcquireReleaseToUseLock` and `AMDAIELowerToAIE`. This gets rid of an intermediate layer of logic operating on `aie.objectfifo` by going from logical objectFifos directly to aie.buffer. This reduces the overall amount of code/complexity needed to get to the same result. --- .../AMD-AIE/aie/AMDAIEAssignLockIDs.cpp | 118 -- .../aie/AMDAIEObjectFifoStatefulTransform.cpp | 797 ---------- .../plugins/target/AMD-AIE/aie/CMakeLists.txt | 2 - compiler/plugins/target/AMD-AIE/aie/Passes.h | 2 - .../aie/test/AIE2_cyclostatic_dma.mlir | 181 --- .../AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir | 182 --- .../AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir | 244 --- .../aie/test/AIE2_delayed_release.mlir | 125 -- .../AMD-AIE/aie/test/assign-lockIDs.mlir | 129 -- .../AMD-AIE/aie/test/base_test_AIE1.mlir | 123 -- .../AMD-AIE/aie/test/base_test_AIE2.mlir | 123 -- .../AMD-AIE/aie/test/broadcast_test.mlir | 374 ----- .../AMD-AIE/aie/test/link_test_AIE1.mlir | 80 - .../AMD-AIE/aie/test/link_test_DDR_to_L1.mlir | 80 - .../AMD-AIE/aie/test/link_test_L1_to_DDR.mlir | 81 - .../AMD-AIE/aie/test/link_test_broadcast.mlir | 136 -- .../aie/test/link_test_distribute.mlir | 155 -- .../AMD-AIE/aie/test/link_test_join.mlir | 191 --- .../target/AMD-AIE/aie/test/matmul_test.mlir | 188 --- .../target/AMD-AIE/aie/test/memTile_test.mlir | 55 - .../AMD-AIE/aie/test/nd_dma_base_AIE2.mlir | 126 -- .../aie/test/nd_dma_distribute_AIE2.mlir | 123 -- .../test/nd_dma_multiple_consumers_AIE2.mlir | 201 --- .../AMD-AIE/aie/test/nested_loop_test.mlir | 365 ----- .../aie/test/non_adjacency_test_1.mlir | 125 -- .../aie/test/non_adjacency_test_2.mlir | 139 -- .../aie/test/non_adjacency_test_AIE2.mlir | 
122 -- .../test/register_external_buffers_test.mlir | 75 - .../same_core_producer_consumer_test.mlir | 103 -- .../AMD-AIE/aie/test/shimRow_mem_test.mlir | 75 - .../AMD-AIE/aie/test/shim_AIE2_test.mlir | 68 - .../AMD-AIE/aie/test/shim_broadcast_test.mlir | 88 -- .../AMD-AIE/aie/test/subview_test_1.mlir | 132 -- .../AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td | 11 + .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp | 10 +- .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td | 40 +- .../iree-amd-aie/PluginRegistration.cpp | 2 - .../AMDAIEAcquireReleaseToUseLock.cpp | 234 +++ .../Transforms/AMDAIECoreLoopUnroll.cpp | 84 -- .../Transforms/AMDAIELowerToAIE.cpp | 1000 ++++++------ .../Transforms/AMDAIELowerToAIE.h | 132 ++ .../iree-amd-aie/Transforms/CMakeLists.txt | 3 +- .../iree-amd-aie/Transforms/PassDetail.h | 2 +- .../iree-amd-aie/Transforms/Passes.cpp | 9 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 8 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 13 +- .../Transforms/test/CMakeLists.txt | 2 +- .../test/acquire_release_to_use_lock.mlir | 214 +++ .../Transforms/test/core_loop_unroll.mlir | 181 --- .../Transforms/test/lower_to_aie.mlir | 1335 +++++++++++------ .../aie_runtime/Utils/CMakeLists.txt | 1 + 51 files changed, 2092 insertions(+), 6297 deletions(-) delete mode 100644 compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp delete mode 100644 compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir delete mode 100644 
compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir create mode 100644 
compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp delete mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir delete mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp deleted file mode 100644 index 3d6efe6fc..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -// This pass aims to assign lockIDs to AIE.lock operations. The lockID is -// numbered from the most recent AIE.lock within the same tile. If the lockID -// exceeds the number of locks on the tile, the pass generates an error and -// terminates. AIE.lock operations for different tiles are numbered -// independently. If there are existing lock IDs, this pass is idempotent -// and only assigns lock IDs to locks without an ID. 
- -#include "AIEDialect.h" -#include "Passes.h" -#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" -#include "llvm/ADT/DenseMap.h" -#include "mlir/Pass/Pass.h" - -#define DEBUG_TYPE "amdaie-assign-lock-ids" - -using namespace mlir; -using namespace xilinx; -using namespace xilinx::AIE; - -namespace mlir::iree_compiler::AMDAIE { -struct AMDAIEAssignLockIDsPass : mlir::OperationPass { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AMDAIEAssignLockIDsPass) - - AMDAIEAssignLockIDsPass() : mlir::OperationPass(resolveTypeID()) {} - - llvm::StringRef getArgument() const override { - return "amdaie-assign-lock-ids"; - } - - llvm::StringRef getName() const override { return "AMDAIEAssignLockIDsPass"; } - - std::unique_ptr clonePass() const override { - return std::make_unique( - *static_cast(this)); - } - - void runOnOperation() override { - DeviceOp device = getOperation(); - OpBuilder rewriter = OpBuilder::atBlockEnd(device.getBody()); - - // All of the lock ops on a tile, separated into ops which have been - // assigned to a lock, and ops which have not. - struct TileLockOps { - DenseSet assigned; - SmallVector unassigned; - }; - - DenseMap tileToLocks; - - // Construct data structure storing locks by tile. 
- device.walk([&](LockOp lockOp) { - TileOp tileOp = xilinx::AIE::getTileOp(*lockOp); - if (lockOp.getLockID().has_value()) { - auto lockID = lockOp.getLockID().value(); - auto iter = tileToLocks.find(tileOp); - if (iter == tileToLocks.end()) - tileToLocks.insert({tileOp, {{lockID}, /* unassigned = */ {}}}); - else { - if (iter->second.assigned.find(lockID) != - iter->second.assigned.end()) { - auto diag = lockOp->emitOpError("is assigned to the same lock (") - << lockID << ") as another op."; - diag.attachNote(tileOp.getLoc()) - << "tile has lock ops assigned to same lock."; - return signalPassFailure(); - } - iter->second.assigned.insert(lockID); - } - } else { - auto iter = tileToLocks.find(tileOp); - if (iter == tileToLocks.end()) - tileToLocks.insert({tileOp, {/* assigned = */ {}, {lockOp}}}); - else - iter->second.unassigned.push_back(lockOp); - } - }); - - AMDAIEDeviceModel deviceModel = mlir::iree_compiler::AMDAIE::getDeviceModel( - static_cast(device.getDevice())); - // IR mutation: assign locks to all unassigned lock ops. 
- for (auto [tileOp, locks] : tileToLocks) { - uint32_t locksPerTile = - deviceModel.getNumLocks(tileOp.getCol(), tileOp.getRow()); - uint32_t nextID = 0; - for (auto lockOp : locks.unassigned) { - while (nextID < locksPerTile && - (locks.assigned.find(nextID) != locks.assigned.end())) { - ++nextID; - } - if (nextID == locksPerTile) { - mlir::InFlightDiagnostic diag = - lockOp->emitOpError("not allocated a lock."); - diag.attachNote(tileOp.getLoc()) << "because only " << locksPerTile - << " locks available in this tile."; - return signalPassFailure(); - } - lockOp.setLockIDAttr(rewriter.getI8IntegerAttr(nextID)); - ++nextID; - } - } - } -}; -std::unique_ptr> createAMDAIEAssignLockIDsPass() { - return std::make_unique(); -} - -void registerAMDAIEAssignLockIDs() { - mlir::registerPass([]() -> std::unique_ptr { - return createAMDAIEAssignLockIDsPass(); - }); -} -} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp deleted file mode 100644 index 5b4d3e6e3..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp +++ /dev/null @@ -1,797 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "AIEDialect.h" -#include "Passes.h" -#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" -#include "llvm/ADT/SetVector.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Pass/Pass.h" - -#define DEBUG_TYPE "amdaie-objectFifo-stateful-transform" - -using namespace mlir; -using namespace mlir::iree_compiler::AMDAIE; - -using xilinx::AIE::AIEObjectFifoType; -using xilinx::AIE::BDDimLayoutArrayAttr; -using xilinx::AIE::BufferOp; -using xilinx::AIE::CoreOp; -using xilinx::AIE::DeviceOp; -using xilinx::AIE::DMABDOp; -using xilinx::AIE::DMAStartOp; -using xilinx::AIE::EndOp; -using xilinx::AIE::FlowOp; -using xilinx::AIE::LockAction; -using xilinx::AIE::LockOp; -using xilinx::AIE::MemOp; -using xilinx::AIE::MemTileDMAOp; -using xilinx::AIE::NextBDOp; -using xilinx::AIE::ObjectFifoAcquireOp; -using xilinx::AIE::ObjectFifoCreateOp; -using xilinx::AIE::ObjectFifoLinkOp; -using xilinx::AIE::ObjectFifoPort; -using xilinx::AIE::ObjectFifoReleaseOp; -using xilinx::AIE::ObjectFifoSubviewAccessOp; -using xilinx::AIE::ShimDMAAllocationOp; -using xilinx::AIE::TileOp; -using xilinx::AIE::UseLockOp; - -namespace { - -struct LockResources { - // Reference to the producer and consumer lock ops created for this resource. - std::pair locks; - // The acquire and release values to be used for producer and consumer locks - // for this resource. - std::pair locksAcqRel; - LockResources() {} - LockResources(const std::pair &locks, - const std::pair &locksAcqRel) - : locks(locks), locksAcqRel(locksAcqRel) {} -}; - -struct ObjectFifoEndpointResource { - // The buffers used for this objectFifo endpoint (multiple: double buffering). - SmallVector buffers; - // The lock resources used for this objectFifo endpoint. 
- LockResources lockResources; - ObjectFifoEndpointResource() {} - ObjectFifoEndpointResource(const SmallVector &buffers, - LockResources &&lockResources) - : buffers(buffers), lockResources(std::move(lockResources)) {} -}; - -struct ObjectFifoResources { - // Offset on the producer's side of the objectFifo. - uint32_t producerOffset{0}; - ObjectFifoEndpointResource producerResource; - // Offset on the consumers' side of the objectFifo. - uint32_t consumersOffset{0}; - DenseMap consumerResources; - ObjectFifoResources() {} - ObjectFifoResources(uint32_t producerOffset, uint32_t consumersOffset) - : producerOffset(producerOffset), consumersOffset(consumersOffset) {} -}; - -SmallVector getInputObjectFifos(ObjectFifoLinkOp &op) { - SmallVector inputObjFifos; - Operation *parent = op.getOperation(); - while ((parent = parent->getParentOp())) { - if (parent->hasTrait()) { - for (auto sym : op.getFifoIns()) { - auto name = dyn_cast(sym); - if (auto *st = SymbolTable::lookupSymbolIn(parent, name); - isa_and_nonnull(st)) - inputObjFifos.push_back(dyn_cast(st)); - } - } - } - return inputObjFifos; -} - -SmallVector getOutputObjectFifos(ObjectFifoLinkOp &op) { - SmallVector outputObjFifos; - Operation *parent = op.getOperation(); - while ((parent = parent->getParentOp())) { - if (parent->hasTrait()) { - for (auto sym : op.getFifoOuts()) { - auto name = dyn_cast(sym); - if (auto *st = SymbolTable::lookupSymbolIn(parent, name); - isa_and_nonnull(st)) - outputObjFifos.push_back(dyn_cast(st)); - } - } - } - return outputObjFifos; -} - -int objFifoSize(ObjectFifoCreateOp op, int index = 0) { - if (llvm::isa(op.getElemNumber())) { - return llvm::dyn_cast( - llvm::dyn_cast(op.getElemNumber())[index]) - .getInt(); - } else { - return llvm::dyn_cast(op.getElemNumber()).getInt(); - } -} - -template -ObjectFifoCreateOp getObjectFifo(T op) { - Operation *parent = op.getOperation(); - while ((parent = parent->getParentOp())) { - if (parent->hasTrait()) { - if (auto *st = 
SymbolTable::lookupSymbolIn(parent, op.getObjFifoName()); - isa_and_nonnull(st)) - return dyn_cast(st); - } - } - return {}; -} - -bool isJoin(ObjectFifoLinkOp op) { - return op.getFifoIns().size() > 1 && op.getFifoOuts().size() == 1; -} - -bool isDistribute(ObjectFifoLinkOp op) { - return op.getFifoOuts().size() > 1 && op.getFifoIns().size() == 1; -} - -bool isOneToOne(ObjectFifoLinkOp op) { - return op.getFifoIns().size() == 1 && op.getFifoOuts().size() == 1; -} - -/// Retrieve ObjectFifoLinkOp of ObjectFifoCreateOp, -/// if it belongs to one. -std::optional getOptionalLinkOp(ObjectFifoCreateOp op) { - auto device = op->getParentOfType(); - for (ObjectFifoLinkOp linkOp : device.getOps()) { - for (ObjectFifoCreateOp in : getInputObjectFifos(linkOp)) - if (in == op) return {linkOp}; - for (ObjectFifoCreateOp out : getOutputObjectFifos(linkOp)) - if (out == op) return {linkOp}; - } - return {}; -} - -} // namespace - -template -void createDMA(DeviceOp &device, OpBuilder &builder, TileOp tileOp, - DMAChannelDir channelDir, int channelIndex, - BDDimLayoutArrayAttr dims, size_t acqNum, size_t relNum, - int64_t len, int64_t offset, - const SmallVector &bufferOps, - const std::pair &locks) { - OpBuilder::InsertionGuard g(builder); - Operation *producer = nullptr; - for (auto memOp : device.getOps()) { - if (memOp.getTile() == tileOp.getResult()) { - producer = memOp.getOperation(); - break; - } - } - - // if none exists, create one - if (!producer) { - if (device->getNumRegions() != 1) - llvm::report_fatal_error("expected num regions for device op"); - OpBuilder::InsertionGuard gg(builder); - builder.setInsertionPointToEnd(device.getBody()); - auto newMemOp = builder.create(builder.getUnknownLoc(), tileOp); - { - OpBuilder::InsertionGuard ggg(builder); - builder.setInsertionPointToStart(&newMemOp.getRegion().emplaceBlock()); - builder.create(builder.getUnknownLoc()); - } - producer = newMemOp.getOperation(); - } - - Block &endBlock = 
producer->getRegion(0).getBlocks().back(); - assert(!endBlock.getOps().empty() && - "expected last block to have aie.end"); - Block *lastDmaBlock = endBlock.getSinglePredecessor(), - *dmaBlock = builder.createBlock(&endBlock), - *bdBlock = builder.createBlock(&endBlock); - - // create DMA channel - { - OpBuilder::InsertionGuard gg(builder); - builder.setInsertionPointToStart(dmaBlock); - builder.create(builder.getUnknownLoc(), channelDir, - channelIndex, /*repeatCount*/ 0, bdBlock, - &endBlock); - } - if (lastDmaBlock) lastDmaBlock->getTerminator()->setSuccessor(dmaBlock, 1); - - auto createBdBlockOps = [&](BufferOp buff, Block *succ) { - LockOp acqLock = locks.first, relLock = locks.second; - builder.create(builder.getUnknownLoc(), acqLock, - LockAction::AcquireGreaterEqual, acqNum); - if (!dims.getValue().empty()) { - builder.create(builder.getUnknownLoc(), buff, offset, len, dims); - } else { - builder.create(builder.getUnknownLoc(), buff, offset, len); - } - builder.create(builder.getUnknownLoc(), relLock, - LockAction::Release, relNum); - builder.create(builder.getUnknownLoc(), succ); - }; - - // create Bd blocks - Block *succ = nullptr, *curr = bdBlock; - for (size_t blockIndex = 0; blockIndex < bufferOps.size(); ++blockIndex) { - if (blockIndex == bufferOps.size() - 1) { - succ = bdBlock; - } else { - succ = builder.createBlock(&endBlock); - } - - OpBuilder::InsertionGuard gg(builder); - builder.setInsertionPointToStart(curr); - createBdBlockOps(bufferOps[blockIndex], succ); - curr = succ; - } -} - -template -void createTileDMA(DeviceOp &device, OpBuilder &builder, TileOp tileOp, - DMAChannelDir channelDir, uint8_t channelIndex, size_t size, - BDDimLayoutArrayAttr dims, uint32_t offset, - const ObjectFifoEndpointResource &endpointResource) { - std::pair locks = endpointResource.lockResources.locks; - uint8_t acqNum = endpointResource.lockResources.locksAcqRel.first; - uint8_t relNum = endpointResource.lockResources.locksAcqRel.second; - createDMA(device, 
builder, tileOp, channelDir, channelIndex, dims, - acqNum, relNum, size, offset, endpointResource.buffers, - locks); -} - -LogicalResult createUseLocks( - OpBuilder &builder, ObjectFifoCreateOp op, ObjectFifoPort port, - size_t numLocks, LockAction lockAction, - const ObjectFifoEndpointResource &endpointResource) { - if (numLocks == 0) return failure(); - LockOp lock; - if (lockAction == LockAction::AcquireGreaterEqual) { - lock = endpointResource.lockResources.locks.second; - } else if (lockAction == LockAction::Release) { - lock = endpointResource.lockResources.locks.first; - } else { - return op.emitOpError() << "unsupported lock action on this resource: " - << stringifyEnum(lockAction); - } - builder.create(builder.getUnknownLoc(), lock, lockAction, - numLocks); - return success(); -} - -LogicalResult replaceReleaseOp( - OpBuilder &builder, ObjectFifoReleaseOp releaseOp, TileOp tileOp, - const DenseMap &resourceMap) { - OpBuilder::InsertionGuard g(builder); - ObjectFifoCreateOp op = getObjectFifo(releaseOp); - auto port = releaseOp.getPort(); - const ObjectFifoEndpointResource &endpointResource = - port == ObjectFifoPort::Produce - ? resourceMap.at(op).producerResource - : resourceMap.at(op).consumerResources.at(tileOp); - builder.setInsertionPointAfter(releaseOp); - return createUseLocks(builder, op, port, releaseOp.getSize(), - LockAction::Release, endpointResource); -} - -LogicalResult replaceObjectAcquireOp( - OpBuilder &builder, ObjectFifoAcquireOp acquireOp, TileOp tileOp, - DenseMap &createOpToIndex, - const DenseMap &resourceMap) { - OpBuilder::InsertionGuard g(builder); - ObjectFifoCreateOp op = getObjectFifo(acquireOp); - if (!createOpToIndex.contains(op)) createOpToIndex[op] = 0; - auto port = acquireOp.getPort(); - const ObjectFifoEndpointResource &endpointResource = - port == ObjectFifoPort::Produce - ? 
resourceMap.at(op).producerResource - : resourceMap.at(op).consumerResources.at(tileOp); - - builder.setInsertionPointAfter(acquireOp); - if (failed(createUseLocks(builder, op, port, acquireOp.getSize(), - LockAction::AcquireGreaterEqual, - endpointResource))) { - return failure(); - } - - for (Operation *userOp : acquireOp->getUsers()) { - auto subviewAccessOp = dyn_cast(userOp); - if (!subviewAccessOp) { - return acquireOp.emitOpError() - << "currently only supports `aie.objectfifo.subview.access` users"; - } - size_t index = subviewAccessOp.getIndex(); - size_t bufferIndex = - (createOpToIndex[op] + index) % endpointResource.buffers.size(); - BufferOp bufferOp = endpointResource.buffers[bufferIndex]; - subviewAccessOp.getResult().replaceAllUsesWith(bufferOp.getResult()); - } - // Increment index to rotate through available buffers objectFifo acquires. - createOpToIndex[op] += acquireOp.getSize(); - return success(); -} - -/// Utility to create a vector of buffer ops for an objectFifo. -SmallVector createBuffers(OpBuilder &builder, - const AMDAIEDeviceModel &deviceModel, - ObjectFifoCreateOp createOp, - size_t numBuffers, TileOp tile, - const std::string &prefix, size_t index) { - SmallVector buffers; - if (deviceModel.isShimTile(tile.getCol(), tile.getRow())) return buffers; - auto fifoType = cast(createOp.getElemType()); - auto elemType = cast(fifoType.getElementType()); - for (int ofElemIndex = 0; ofElemIndex < numBuffers; ofElemIndex++) { - auto buff = builder.create( - builder.getUnknownLoc(), elemType, tile, - builder.getStringAttr(prefix + "_buff_" + std::to_string(index) + "_" + - std::to_string(ofElemIndex)), - /*address*/ nullptr, - /*mem_bank*/ nullptr); - buffers.push_back(buff); - } - return buffers; -} - -std::pair createLockPair(OpBuilder &builder, - const AMDAIEDeviceModel &deviceModel, - TileOp tile, int depth, - const std::string &prefix, - size_t index) { - // TODO(jornt): make this more extensible towards different lock - // schemes. 
- int producerInitValue{depth}; - int consumerInitValue{0}; - // Use no lock value for shim tiles as the shim DMAs don't need to be - // synchronized. TODO(jornt): we might be able to just not create any locks - // for shims, see buffers. - if (deviceModel.isShimTile(tile.getCol(), tile.getRow())) - producerInitValue = 0; - LockOp producerLock = builder.create( - builder.getUnknownLoc(), tile, IntegerAttr{}, - builder.getI8IntegerAttr(producerInitValue), - builder.getStringAttr(prefix + "_prod_lock_" + std::to_string(index))); - LockOp consumerLock = builder.create( - builder.getUnknownLoc(), tile, IntegerAttr{}, - builder.getI8IntegerAttr(consumerInitValue), - builder.getStringAttr(prefix + "_cons_lock_" + std::to_string(index))); - return std::make_pair(producerLock, consumerLock); -} - -/// Utility to create buffers and locks for the objectFifo producer side. -LogicalResult createProducerBuffersAndLocks( - OpBuilder &builder, const AMDAIEDeviceModel &deviceModel, - ObjectFifoCreateOp createOp, size_t index, - DenseMap &resourceMap) { - OpBuilder::InsertionGuard g(builder); - TileOp producerTileOp = - dyn_cast_if_present(createOp.getProducerTile().getDefiningOp()); - if (!producerTileOp) { - return createOp.emitOpError() << "expected a producer tile op, but got: " - << createOp.getProducerTile(); - } - size_t depth = objFifoSize(createOp); - SmallVector producerBuffers = - createBuffers(builder, deviceModel, createOp, depth, producerTileOp, - name(createOp).str() + "_prod", index); - std::pair lockPair = - createLockPair(builder, deviceModel, producerTileOp, depth, - name(createOp).str() + "_prod", index); - // Swap for producers to synchronize with potential consumers on the other - // side. 
- std::swap(lockPair.first, lockPair.second); - std::pair lockAcqRel = std::make_pair(1, 1); - resourceMap[createOp].producerResource = ObjectFifoEndpointResource( - producerBuffers, LockResources(lockPair, lockAcqRel)); - return success(); -} - -/// Utility to create buffers and locks for the objectFifo consumer side. -LogicalResult createConsumerBuffersAndLocks( - OpBuilder &builder, const AMDAIEDeviceModel &deviceModel, - ObjectFifoCreateOp createOp, size_t external_idx, - DenseMap &resourceMap) { - OpBuilder::InsertionGuard g(builder); - resourceMap[createOp].consumerResources.clear(); - size_t depth = objFifoSize(createOp); - for (auto &&[idx1, consumerTile] : - llvm::enumerate(createOp.getConsumerTiles())) { - size_t idx = external_idx * createOp.getConsumerTiles().size() + idx1; - TileOp consumerTileOp = - dyn_cast_if_present(consumerTile.getDefiningOp()); - if (!consumerTileOp) { - return createOp.emitOpError() - << "expected a consumer tile op, but got: " << consumerTile; - } - SmallVector consumerBuffers = - createBuffers(builder, deviceModel, createOp, depth, consumerTileOp, - name(createOp).str() + "_cons", idx); - std::pair lockPair = - createLockPair(builder, deviceModel, consumerTileOp, depth, - name(createOp).str() + "_cons", idx); - std::pair lockAcqRel = std::make_pair(1, 1); - resourceMap[createOp].consumerResources[consumerTileOp] = - ObjectFifoEndpointResource(consumerBuffers, - LockResources(lockPair, lockAcqRel)); - } - return success(); -} - -LogicalResult createBuffersAndLocks( - OpBuilder &builder, DeviceOp device, ObjectFifoLinkOp linkOp, - DenseMap &resourceMap) { - OpBuilder::InsertionGuard g(builder); - AMDAIEDeviceModel deviceModel = - getDeviceModel(static_cast(device.getDevice())); - - SmallVector inputs = getInputObjectFifos(linkOp); - SmallVector outputs = getOutputObjectFifos(linkOp); - assert(inputs.size() > 0 && "there should be inputs in the link op"); - assert(outputs.size() > 0 && "there should be outputs in the link op"); - 
uint32_t inputsOffset{0}; - for (ObjectFifoCreateOp input : inputs) { - resourceMap[input] = ObjectFifoResources(0, inputsOffset); - auto fifoType = cast(input.getElemType()); - auto fifoElemType = cast(fifoType.getElementType()); - inputsOffset += fifoElemType.getNumElements(); - } - uint32_t outputsOffset{0}; - for (ObjectFifoCreateOp output : outputs) { - resourceMap[output] = ObjectFifoResources(outputsOffset, 0); - auto fifoType = cast(output.getElemType()); - auto fifoElemType = cast(fifoType.getElementType()); - outputsOffset += fifoElemType.getNumElements(); - } - - ObjectFifoCreateOp linkCreateOp; - SmallVector linkOtherOps; - TileOp linkTileOp; - if (isJoin(linkOp)) { - assert(outputs.size() == 1 && "single output expected"); - linkCreateOp = outputs[0]; - linkOtherOps = inputs; - linkTileOp = dyn_cast_if_present( - linkCreateOp.getProducerTile().getDefiningOp()); - } else if (isDistribute(linkOp)) { - assert(inputs.size() == 1 && "single input expected"); - linkCreateOp = inputs[0]; - linkOtherOps = outputs; - linkTileOp = dyn_cast_if_present( - linkCreateOp.getConsumerTiles()[0].getDefiningOp()); - } else if (isOneToOne(linkOp)) { - auto inFifoType = cast(inputs[0].getElemType()); - auto inFifoElemType = cast(inFifoType.getElementType()); - auto outFifoType = cast(outputs[0].getElemType()); - auto outFifoElemType = cast(outFifoType.getElementType()); - if (inFifoElemType.getNumElements() >= outFifoElemType.getNumElements()) { - linkCreateOp = inputs[0]; - linkOtherOps = outputs; - linkTileOp = dyn_cast_if_present( - linkCreateOp.getConsumerTiles()[0].getDefiningOp()); - } else { - linkCreateOp = outputs[0]; - linkOtherOps = inputs; - linkTileOp = dyn_cast_if_present( - linkCreateOp.getProducerTile().getDefiningOp()); - } - } else { - return linkOp.emitOpError() - << "only join or distribute link supported currently"; - } - if (!linkTileOp) { - return linkCreateOp.emitOpError() << "expected a tile op"; - } - - size_t depth = objFifoSize(linkCreateOp); - 
if (!depth) return linkCreateOp.emitOpError() << "doesn't have a size"; - - // Reset opbuilder location to after the last tile declaration - auto tiles = device.getBody()->getOps(); - assert(!tiles.empty() && "no tiles in device"); - builder.setInsertionPointAfter(*std::prev(tiles.end(), 1)); - - { - SmallVector linkBuffers = - createBuffers(builder, deviceModel, linkCreateOp, depth, linkTileOp, - name(linkCreateOp).str() + "_link", 0); - size_t linkDepth = depth * linkOtherOps.size(); - std::pair linkLockPair = - createLockPair(builder, deviceModel, linkTileOp, linkDepth, - name(linkCreateOp).str() + "_link", 0); - uint8_t inputAcqRelValue = linkDepth / depth / inputs.size(); - std::pair inputLockAcqRel = - std::make_pair(inputAcqRelValue, inputAcqRelValue); - for (ObjectFifoCreateOp input : inputs) { - resourceMap[input].consumerResources[linkTileOp] = - ObjectFifoEndpointResource( - linkBuffers, LockResources(linkLockPair, inputLockAcqRel)); - } - // Swap locks for outputs to synchronize link inputs and outputs. 
- std::swap(linkLockPair.first, linkLockPair.second); - uint8_t outputAcqRelValue = linkDepth / depth / outputs.size(); - std::pair outputLockAcqRel = - std::make_pair(outputAcqRelValue, outputAcqRelValue); - for (ObjectFifoCreateOp output : outputs) { - resourceMap[output].producerResource = ObjectFifoEndpointResource( - linkBuffers, LockResources(linkLockPair, outputLockAcqRel)); - } - } - - for (auto &&[idx, input] : llvm::enumerate(inputs)) { - if (failed(createProducerBuffersAndLocks(builder, deviceModel, input, idx, - resourceMap))) { - return failure(); - } - } - - for (auto &&[idx, output] : llvm::enumerate(outputs)) { - if (failed(createConsumerBuffersAndLocks(builder, deviceModel, output, idx, - resourceMap))) { - return failure(); - } - } - return success(); -} - -LogicalResult createBuffersAndLocksForNonLinkOps( - OpBuilder &builder, DeviceOp device, ObjectFifoCreateOp createOp, - DenseMap &resourceMap) { - // Skip objectFifoCreateOps in links. - if (getOptionalLinkOp(createOp)) return success(); - OpBuilder::InsertionGuard g(builder); - AMDAIEDeviceModel deviceModel = - getDeviceModel(static_cast(device.getDevice())); - resourceMap[createOp] = ObjectFifoResources(0, 0); - size_t depth = objFifoSize(createOp); - if (!depth) return createOp.emitOpError() << "doesn't have a depth size"; - - // Reset opbuilder location to after the last tile declaration - auto tiles = device.getBody()->getOps(); - assert(!tiles.empty() && "no tiles in device"); - builder.setInsertionPointAfter(*std::prev(tiles.end(), 1)); - if (failed(createProducerBuffersAndLocks(builder, deviceModel, createOp, 0, - resourceMap))) { - return failure(); - } - if (failed(createConsumerBuffersAndLocks(builder, deviceModel, createOp, 0, - resourceMap))) { - return failure(); - } - return success(); -} - -LogicalResult createTileDMAs( - OpBuilder &builder, DeviceOp device, ObjectFifoCreateOp createOp, - DenseMap &resourceMap, - const DenseMap> &symbolToFlowOps) { - OpBuilder::InsertionGuard 
g(builder); - AMDAIEDeviceModel deviceModel = - getDeviceModel(static_cast(device.getDevice())); - - auto createDMA = [&deviceModel, &device, &builder]( - TileOp tileOp, DMAChannelDir channelDir, - uint8_t channelIndex, size_t size, - BDDimLayoutArrayAttr dims, StringRef name, - uint32_t offset, - const ObjectFifoEndpointResource &endpointResource) { - if (deviceModel.isShimTile(tileOp.getCol(), tileOp.getRow())) { - builder.create(builder.getUnknownLoc(), name, - channelDir, channelIndex, - tileOp.getCol()); - } else if (deviceModel.isMemTile(tileOp.getCol(), tileOp.getRow())) { - createTileDMA(device, builder, tileOp, channelDir, - channelIndex, size, dims, offset, - endpointResource); - } else { - createTileDMA(device, builder, tileOp, channelDir, channelIndex, - size, dims, offset, endpointResource); - } - }; - - // Collect producer and consumer DMA channels - if (!symbolToFlowOps.contains(createOp.getSymName())) { - return createOp.emitOpError() - << "symbol name not found in symbol to flow ops map"; - } - SmallVector flowOps = symbolToFlowOps.at(createOp.getSymName()); - SmallVector producerChannelsVec = llvm::map_to_vector( - flowOps, [](FlowOp flowOp) { return flowOp.getSourceChannel(); }); - llvm::SmallSetVector producerChannels(producerChannelsVec.begin(), - producerChannelsVec.end()); - if (producerChannels.size() != 1) - return createOp.emitOpError() << "expected a single producer channel"; - DenseMap consumerChannelsMap; - for (FlowOp flowOp : flowOps) - consumerChannelsMap[flowOp.getDest()] = flowOp.getDestChannel(); - if (consumerChannelsMap.size() != createOp.getConsumerTiles().size()) { - return createOp.emitOpError() << "expected same number of consumers as the " - "number of objectFifo consumers"; - } - - auto fifo = cast(createOp.getElemType()); - auto elemType = cast(fifo.getElementType()); - size_t size = elemType.getNumElements(); - - // create producer tile DMA - builder.setInsertionPoint(&device.getBody()->back()); - TileOp producerTileOp = 
- dyn_cast_if_present(createOp.getProducerTile().getDefiningOp()); - if (!producerTileOp) - return createOp.emitOpError() << "expected a producer TileOp"; - const ObjectFifoResources &opResource = resourceMap[createOp]; - const ObjectFifoEndpointResource &producerEndpointResource = - opResource.producerResource; - uint32_t producerOffset = opResource.producerOffset; - createDMA(producerTileOp, DMAChannelDir::MM2S, producerChannels[0], size, - createOp.getDimensionsToStreamAttr(), createOp.getName(), - producerOffset, producerEndpointResource); - - assert(opResource.consumerResources.size() == - createOp.getConsumerTiles().size() && - "same number of consumer resources expected as the number of consumer " - "tiles on the objectFifo"); - for (auto &&[idx, consumerTile] : - llvm::enumerate(createOp.getConsumerTiles())) { - TileOp consumerTileOp = - dyn_cast_if_present(consumerTile.getDefiningOp()); - if (!consumerTileOp) { - return createOp.emitOpError() - << "expected a consumer TileOp, but got: " << consumerTile; - } - if (!consumerChannelsMap.contains(consumerTile)) { - return createOp.emitOpError() - << "did not find consumer tile (" << consumerTile - << ") in consumerChannelsMap"; - } - uint8_t consumerChannel = consumerChannelsMap[consumerTile]; - - // create consumer tile DMA - BDDimLayoutArrayAttr consumerDims = - createOp.getDimensionsFromStreamPerConsumer()[idx]; - uint32_t consumersOffset = opResource.consumersOffset; - const ObjectFifoEndpointResource &consumerEndpointResource = - opResource.consumerResources.at(consumerTileOp); - createDMA(consumerTileOp, DMAChannelDir::S2MM, consumerChannel, size, - consumerDims, createOp.getName(), consumersOffset, - consumerEndpointResource); - } - return success(); -} - -namespace mlir::iree_compiler::AMDAIE { -struct AMDAIEObjectFifoStatefulTransformPass : mlir::OperationPass { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( - AMDAIEObjectFifoStatefulTransformPass) - - AMDAIEObjectFifoStatefulTransformPass() - : 
mlir::OperationPass(resolveTypeID()) {} - - llvm::StringRef getArgument() const override { - return "amdaie-objectFifo-stateful-transform"; - } - - llvm::StringRef getName() const override { - return " AMDAIEObjectFifoStatefulTransformPass"; - } - - std::unique_ptr clonePass() const override { - return std::make_unique( - *static_cast(this)); - } - - void getDependentDialects(::mlir::DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - } - - void runOnOperation() override { - DeviceOp device = getOperation(); - OpBuilder builder = OpBuilder::atBlockEnd(device.getBody()); - - // Flow ops contain the DMA information, so create a map for easy lookup - // based on a global symbol. - DenseMap> symbolToFlowOps; - device.walk([&](FlowOp op) { - std::optional symbolAttr = op.getSymbol(); - if (symbolAttr) symbolToFlowOps[symbolAttr.value()].push_back(op); - }); - - DenseMap resourceMap; - for (ObjectFifoLinkOp linkOp : device.getOps()) { - if (failed(createBuffersAndLocks(builder, device, linkOp, resourceMap))) { - return signalPassFailure(); - } - } - - // Handle objectFifos that are not inside a link. 
- for (ObjectFifoCreateOp createOp : device.getOps()) { - if (failed(createBuffersAndLocksForNonLinkOps(builder, device, createOp, - resourceMap))) { - return signalPassFailure(); - } - } - - for (ObjectFifoCreateOp createOp : device.getOps()) { - if (failed(createTileDMAs(builder, device, createOp, resourceMap, - symbolToFlowOps))) { - return signalPassFailure(); - } - } - - // Replace ops - for (auto coreOp : device.getOps()) { - TileOp tileOp = - dyn_cast_if_present(coreOp.getTile().getDefiningOp()); - if (!tileOp) { - coreOp.emitOpError() - << "expected a TileOp, but got: " << coreOp.getTile(); - return signalPassFailure(); - } - WalkResult res = coreOp.walk([&](ObjectFifoReleaseOp releaseOp) { - if (failed(replaceReleaseOp(builder, releaseOp, tileOp, resourceMap))) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return signalPassFailure(); - // Use a map from objectFifos to indices to rotate through available - // buffers for double buffering purposes. 
- DenseMap createOpToIndex; - res = coreOp.walk([&](ObjectFifoAcquireOp acquireOp) { - if (failed(replaceObjectAcquireOp(builder, acquireOp, tileOp, - createOpToIndex, resourceMap))) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return signalPassFailure(); - } - - // make global symbols to replace the to be erased ObjectFifoCreateOps - for (auto createOp : device.getOps()) { - OpBuilder::InsertionGuard gg(builder); - builder.setInsertionPointToStart(&device.getBodyRegion().front()); - auto symName = createOp.getName(); - createOp->setAttr(SymbolTable::getSymbolAttrName(), - builder.getStringAttr("__erase_" + symName)); - auto memrefType = - cast(createOp.getElemType()).getElementType(); - builder.create(builder.getUnknownLoc(), symName, - builder.getStringAttr("public"), - memrefType, nullptr, false, nullptr); - } - - // Remove old ops - IRRewriter rewriter(&getContext()); - device.walk([&](Operation *op) { - if (isa(op)) { - op->dropAllUses(); - rewriter.eraseOp(op); - } - }); - } -}; - -std::unique_ptr> -createAMDAIEObjectFifoStatefulTransformPass() { - return std::make_unique(); -} - -void registerAMDAIEObjectFifoStatefulTransform() { - mlir::registerPass([]() -> std::unique_ptr { - return createAMDAIEObjectFifoStatefulTransformPass(); - }); -} - -} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt index 52244c48a..21167e9b1 100644 --- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt @@ -135,13 +135,11 @@ iree_cc_library( SRCS AMDAIEAssignBufferAddressesBasic.cpp AMDAIEAssignBufferDescriptorIDs.cpp - AMDAIEAssignLockIDs.cpp AMDAIECoreToStandard.cpp AMDAIECreatePathFindFlows.cpp AMDAIEDmaToNpu.cpp AMDAIELocalizeLocks.cpp AMDAIENormalizeAddressSpaces.cpp - AMDAIEObjectFifoStatefulTransform.cpp DEPS 
iree-amd-aie::aie_runtime::iree_aie_runtime_static ::AIEDialectIR diff --git a/compiler/plugins/target/AMD-AIE/aie/Passes.h b/compiler/plugins/target/AMD-AIE/aie/Passes.h index bf9e64477..a78c14c50 100644 --- a/compiler/plugins/target/AMD-AIE/aie/Passes.h +++ b/compiler/plugins/target/AMD-AIE/aie/Passes.h @@ -37,11 +37,9 @@ createAMDAIEDmaToNpuPass(); void registerAMDAIEAssignBufferAddressesBasic(); void registerAMDAIEAssignBufferDescriptorIDs(); -void registerAMDAIEAssignLockIDs(); void registerAMDAIECoreToStandard(); void registerAMDAIELocalizeLocks(); void registerAMDAIENormalizeAddressSpaces(); -void registerAMDAIEObjectFifoStatefulTransform(); void registerAMDAIERoutePathfinderFlows(); void registerAMDAIEDmaToNpu(); diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir deleted file mode 100644 index e703e57dd..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir +++ /dev/null @@ -1,181 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @fifo : memref -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_8_3:.*]] = aie.tile(8, 3) -// CHECK-DAG: %[[BUFFER_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_0"} : memref -// CHECK-DAG: %[[BUFFER_2_2_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_1"} : memref -// CHECK-DAG: %[[BUFFER_2_2_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_2"} : memref -// CHECK-DAG: %[[BUFFER_2_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_3"} : memref -// CHECK-DAG: %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_2_2_3:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_8_3:.*]] = 
aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_0"} : memref -// CHECK-DAG: %[[BUFFER_8_3_4:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_1"} : memref -// CHECK-DAG: %[[BUFFER_8_3_5:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_2"} : memref -// CHECK-DAG: %[[BUFFER_8_3_6:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_3"} : memref -// CHECK-DAG: %[[LOCK_8_3:.*]] = aie.lock(%[[TILE_8_3]]) {init = 4 : i8, sym_name = "fifo_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_8_3_7:.*]] = aie.lock(%[[TILE_8_3]]) {init = 0 : i8, sym_name = "fifo_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_8_3_8:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "buf83"} : memref<4xi32> -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_8_3]], DMA : 0) {symbol = @fifo} -// CHECK: %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) { -// CHECK: %[[C55_I32:.*]] = arith.constant 55 : i32 -// CHECK: %[[C66_I32:.*]] = arith.constant 66 : i32 -// CHECK: %[[C77_I32:.*]] = arith.constant 77 : i32 -// CHECK: %[[C88_I32:.*]] = arith.constant 88 : i32 -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C55_I32]], %[[BUFFER_2_2]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C66_I32]], %[[BUFFER_2_2_0]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C77_I32]], %[[BUFFER_2_2_1]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C88_I32]], %[[BUFFER_2_2_2]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_8_3:.*]] = aie.core(%[[TILE_8_3]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: 
%[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: aie.use_lock(%[[LOCK_8_3_7]], AcquireGreaterEqual, 1) -// CHECK: %[[VAL_0:.*]] = memref.load %[[BUFFER_8_3]][] : memref -// CHECK: memref.store %[[VAL_0]], %[[BUFFER_8_3_8]]{{\[}}%[[C0]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[LOCK_8_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_8_3_7]], AcquireGreaterEqual, 2) -// CHECK: %[[VAL_1:.*]] = memref.load %[[BUFFER_8_3_4]][] : memref -// CHECK: %[[VAL_2:.*]] = memref.load %[[BUFFER_8_3_5]][] : memref -// CHECK: memref.store %[[VAL_1]], %[[BUFFER_8_3_8]]{{\[}}%[[C1]]] : memref<4xi32> -// CHECK: memref.store %[[VAL_2]], %[[BUFFER_8_3_8]]{{\[}}%[[C2]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[LOCK_8_3]], Release, 2) -// CHECK: aie.use_lock(%[[LOCK_8_3_7]], AcquireGreaterEqual, 1) -// CHECK: %[[VAL_3:.*]] = memref.load %[[BUFFER_8_3_6]][] : memref -// CHECK: memref.store %[[VAL_3]], %[[BUFFER_8_3_8]]{{\[}}%[[C3]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[LOCK_8_3]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_4:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_0]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_1]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_2]] : memref) 
{len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_8_3:.*]] = aie.mem(%[[TILE_8_3]]) { -// CHECK: %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_8_3]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_8_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_8_3_4]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_8_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_8_3_5]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_8_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_8_3_6]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_8_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @aie2_cyclostatic_dma { - aie.device(xcve2302) { - %tile22 = aie.tile(2, 2) // producer tile - %tile83 = aie.tile(8, 3) // consumer tile - %buf83 = aie.buffer(%tile83) {sym_name = "buf83"} : memref<4xi32> - aie.flow(%tile22, DMA : 0, %tile83, DMA : 0) {symbol = @fifo} - // ObjectFifo that can hold 4 memrefs, populated by tile22 and - // consumed by tile83 - aie.objectfifo @fifo (%tile22, {%tile83}, 4 : i32) : !aie.objectfifo> - // Producer core - %core22 = aie.core(%tile22) { - %c55 = arith.constant 55 : i32 - %c66 = arith.constant 66 : i32 - %c77 = arith.constant 77 : i32 - %c88 = arith.constant 88 : i32 - // Push 55 - %subview0 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview0_obj = aie.objectfifo.subview.access 
%subview0[0] : !aie.objectfifosubview> -> memref - memref.store %c55, %subview0_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // Push 66 - %subview1 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview1_obj = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref - memref.store %c66, %subview1_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // Push 77 - %subview2 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref - memref.store %c77, %subview2_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // Push 88 - %subview3 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref - memref.store %c88, %subview3_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - aie.end - } - // Consumer core - %core28 = aie.core(%tile83) { - // Consumer pattern: {1, 2, 1} - %i0 = arith.constant 0 : index - %i1 = arith.constant 1 : index - %i2 = arith.constant 2 : index - %i3 = arith.constant 3 : index - // Pop 1 object off queue - %subview0 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview> - %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref - %v55 = memref.load %subview0_obj[] : memref - memref.store %v55, %buf83[%i0] : memref<4xi32> - aie.objectfifo.release @fifo (Consume, 1) - // Pop 2 objects off queue - %subview1 = aie.objectfifo.acquire @fifo (Consume, 2) : !aie.objectfifosubview> - %subview1_obj0 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref - %subview1_obj1 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview> -> memref - %v66 = memref.load %subview1_obj0[] : memref - %v77 = memref.load %subview1_obj1[] : memref - memref.store %v66, %buf83[%i1] : 
memref<4xi32> - memref.store %v77, %buf83[%i2] : memref<4xi32> - aie.objectfifo.release @fifo (Consume, 2) - // Pop 1 object off queue - %subview2 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview> - %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref - %v88 = memref.load %subview2_obj[] : memref - memref.store %v88, %buf83[%i3] : memref<4xi32> - aie.objectfifo.release @fifo (Consume, 1) - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir deleted file mode 100644 index 61091228a..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir +++ /dev/null @@ -1,182 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @fifo : memref -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_2_3:.*]] = aie.tile(2, 3) -// CHECK-DAG: %[[BUFFER_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_0"} : memref -// CHECK-DAG: %[[BUFFER_2_2_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_1"} : memref -// CHECK-DAG: %[[BUFFER_2_2_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_2"} : memref -// CHECK-DAG: %[[BUFFER_2_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_3"} : memref -// CHECK-DAG: %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_2_2_3:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_2_3:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "fifo_cons_buff_0_0"} : memref -// CHECK-DAG: %[[BUFFER_2_3_4:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "fifo_cons_buff_0_1"} : memref -// CHECK-DAG: %[[BUFFER_2_3_5:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = 
"fifo_cons_buff_0_2"} : memref -// CHECK-DAG: %[[BUFFER_2_3_6:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "fifo_cons_buff_0_3"} : memref -// CHECK-DAG: %[[LOCK_2_3:.*]] = aie.lock(%[[TILE_2_3]]) {init = 4 : i8, sym_name = "fifo_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_2_3_7:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "fifo_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_2_3_8:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "buf23"} : memref<4xi32> -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_3]], DMA : 0) {symbol = @fifo} -// CHECK: %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) { -// CHECK: %[[C55_I32:.*]] = arith.constant 55 : i32 -// CHECK: %[[C66_I32:.*]] = arith.constant 66 : i32 -// CHECK: %[[C77_I32:.*]] = arith.constant 77 : i32 -// CHECK: %[[C88_I32:.*]] = arith.constant 88 : i32 -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C55_I32]], %[[BUFFER_2_2]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C66_I32]], %[[BUFFER_2_2_0]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C77_I32]], %[[BUFFER_2_2_1]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C88_I32]], %[[BUFFER_2_2_2]][] : memref -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_2_3:.*]] = aie.core(%[[TILE_2_3]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: aie.use_lock(%[[LOCK_2_3_7]], AcquireGreaterEqual, 1) -// CHECK: %[[VAL_0:.*]] = memref.load %[[BUFFER_2_3]][] : memref -// CHECK: memref.store %[[VAL_0]], 
%[[BUFFER_2_3_8]]{{\[}}%[[C0]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[LOCK_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_2_3_7]], AcquireGreaterEqual, 2) -// CHECK: %[[VAL_1:.*]] = memref.load %[[BUFFER_2_3_4]][] : memref -// CHECK: %[[VAL_2:.*]] = memref.load %[[BUFFER_2_3_5]][] : memref -// CHECK: memref.store %[[VAL_1]], %[[BUFFER_2_3_8]]{{\[}}%[[C1]]] : memref<4xi32> -// CHECK: memref.store %[[VAL_2]], %[[BUFFER_2_3_8]]{{\[}}%[[C2]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[LOCK_2_3]], Release, 2) -// CHECK: aie.use_lock(%[[LOCK_2_3_7]], AcquireGreaterEqual, 1) -// CHECK: %[[VAL_3:.*]] = memref.load %[[BUFFER_2_3_6]][] : memref -// CHECK: memref.store %[[VAL_3]], %[[BUFFER_2_3_8]]{{\[}}%[[C3]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[LOCK_2_3]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_4:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_0]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_1]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_2]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) { -// CHECK: %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, 
^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_3]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_3_4]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_3_5]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_3_6]] : memref) {len = 1 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @aie2_cyclostatic_l1 { - aie.device(xcve2302) { - %tile22 = aie.tile(2, 2) // producer tile - %tile23 = aie.tile(2, 3) // consumer tile - %buf23 = aie.buffer(%tile23) {sym_name = "buf23"} : memref<4xi32> - aie.flow(%tile22, DMA : 0, %tile23, DMA : 0) {symbol = @fifo} - // ObjectFifo that can hold 4 memrefs, populated by tile22 and - // consumed by tile23 - aie.objectfifo @fifo (%tile22, {%tile23}, 4 : i32) : !aie.objectfifo> - // Producer core - %core22 = aie.core(%tile22) { - %c55 = arith.constant 55 : i32 - %c66 = arith.constant 66 : i32 - %c77 = arith.constant 77 : i32 - %c88 = arith.constant 88 : i32 - // Push 55 - %subview0 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref - memref.store %c55, %subview0_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // Push 66 - %subview1 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview1_obj = 
aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref - memref.store %c66, %subview1_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // Push 77 - %subview2 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref - memref.store %c77, %subview2_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // Push 88 - %subview3 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref - memref.store %c88, %subview3_obj[] : memref - aie.objectfifo.release @fifo (Produce, 1) - aie.end - } - // Consumer core - %core23 = aie.core(%tile23) { - // Consumer pattern: {1, 2, 1} - %i0 = arith.constant 0 : index - %i1 = arith.constant 1 : index - %i2 = arith.constant 2 : index - %i3 = arith.constant 3 : index - // Pop 1 object off queue - %subview0 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview> - %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref - %v55 = memref.load %subview0_obj[] : memref - memref.store %v55, %buf23[%i0] : memref<4xi32> - aie.objectfifo.release @fifo (Consume, 1) - // Pop 2 objects off queue - %subview1 = aie.objectfifo.acquire @fifo (Consume, 2) : !aie.objectfifosubview> - %subview1_obj0 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref - %subview1_obj1 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview> -> memref - %v66 = memref.load %subview1_obj0[] : memref - %v77 = memref.load %subview1_obj1[] : memref - memref.store %v66, %buf23[%i1] : memref<4xi32> - memref.store %v77, %buf23[%i2] : memref<4xi32> - aie.objectfifo.release @fifo (Consume, 2) - // Pop 1 object off queue - %subview2 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview> - %subview2_obj = 
aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref - %v88 = memref.load %subview2_obj[] : memref - memref.store %v88, %buf23[%i3] : memref<4xi32> - aie.objectfifo.release @fifo (Consume, 1) - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir deleted file mode 100644 index 9cd02f03c..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir +++ /dev/null @@ -1,244 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @fifo1 : memref<1xi32> -// CHECK: memref.global "public" @fifo0 : memref<1xi32> -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_8_3:.*]] = aie.tile(8, 3) -// CHECK-DAG: %[[FIFO1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_0"} : memref<1xi32> -// CHECK-DAG: %[[FIFO1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_1"} : memref<1xi32> -// CHECK-DAG: %[[FIFO1_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_2"} : memref<1xi32> -// CHECK-DAG: %[[FIFO1_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_3"} : memref<1xi32> -// CHECK-DAG: %[[FIFO1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_8_3]]) {init = 4 : i8, sym_name = "fifo1_cons_prod_lock_0"} -// CHECK-DAG: %[[FIFO1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_8_3]]) {init = 0 : i8, sym_name = "fifo1_cons_cons_lock_0"} -// CHECK-DAG: %[[FIFO0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_0"} : memref<1xi32> -// CHECK-DAG: %[[FIFO0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_1"} : memref<1xi32> -// CHECK-DAG: %[[FIFO0_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_2"} : 
memref<1xi32> -// CHECK-DAG: %[[FIFO0_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_3"} : memref<1xi32> -// CHECK-DAG: %[[FIFO0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 4 : i8, sym_name = "fifo0_link_prod_lock_0"} -// CHECK-DAG: %[[FIFO0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "fifo0_link_cons_lock_0"} -// CHECK-DAG: %[[FIFO0_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_0"} : memref<1xi32> -// CHECK-DAG: %[[FIFO0_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_1"} : memref<1xi32> -// CHECK-DAG: %[[FIFO0_BUFF_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_2"} : memref<1xi32> -// CHECK-DAG: %[[FIFO0_BUFF_3:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_3"} : memref<1xi32> -// CHECK-DAG: %[[FIFO0_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo0_prod_prod_lock_0"} -// CHECK-DAG: %[[FIFO0_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo0_prod_cons_lock_0"} -// CHECK-DAG: %[[BUF83:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "buf83"} : memref<1xi32> -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_1]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_8_3]], DMA : 0) -// CHECK: %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C55_I32:.*]] = arith.constant 55 : i32 -// CHECK: %[[C66_I32:.*]] = arith.constant 66 : i32 -// CHECK: %[[C77_I32:.*]] = arith.constant 77 : i32 -// CHECK: %[[C88_I32:.*]] = arith.constant 88 : i32 -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C55_I32]], %[[FIFO0_BUFF_0]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C66_I32]], %[[FIFO0_BUFF_1]]{{\[}}%[[C0]]] : 
memref<1xi32> -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C77_I32]], %[[FIFO0_BUFF_2]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C88_I32]], %[[FIFO0_BUFF_3]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_8_3:.*]] = aie.core(%[[TILE_8_3]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: %[[VAL_0:.*]] = memref.load %[[FIFO1_CONS_BUFF_0]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: memref.store %[[VAL_0]], %[[BUF83]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], AcquireGreaterEqual, 2) -// CHECK: %[[VAL_1:.*]] = memref.load %[[FIFO1_CONS_BUFF_1]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: %[[VAL_2:.*]] = memref.load %[[FIFO1_CONS_BUFF_2]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: memref.store %[[VAL_1]], %[[BUF83]]{{\[}}%[[C1]]] : memref<1xi32> -// CHECK: memref.store %[[VAL_2]], %[[BUF83]]{{\[}}%[[C2]]] : memref<1xi32> -// CHECK: aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], Release, 2) -// CHECK: aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: %[[VAL_3:.*]] = memref.load %[[FIFO1_CONS_BUFF_3]]{{\[}}%[[C0]]] : memref<1xi32> -// CHECK: memref.store %[[VAL_3]], %[[BUF83]]{{\[}}%[[C3]]] : memref<1xi32> -// CHECK: aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_4:.*]] = 
aie.dma_start(MM2S, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_BUFF_0]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_BUFF_1]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_BUFF_2]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_BUFF_3]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_0]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_1]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_2]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: 
-// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_3]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_6:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb10) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_0]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_1]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb8 -// CHECK: ^bb8: -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_2]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb9 -// CHECK: ^bb9: -// CHECK: aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO0_CONS_BUFF_3]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb10: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_8_3:.*]] = aie.mem(%[[TILE_8_3]]) { -// CHECK: %[[VAL_7:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO1_CONS_BUFF_0]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO1_CONS_BUFF_1]] : memref<1xi32>) {len = 1 : i32} -// CHECK: 
aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO1_CONS_BUFF_2]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FIFO1_CONS_BUFF_3]] : memref<1xi32>) {len = 1 : i32} -// CHECK: aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @aie2_cyclostatic_l2 { - aie.device(xcve2302) { - %tile22 = aie.tile(2, 2) // producer tile - %memtile = aie.tile(2, 1) // mem tile - %tile83 = aie.tile(8, 3) // consumer tile - %buf83 = aie.buffer(%tile83) {sym_name = "buf83"} : memref<1xi32> - aie.flow(%tile22, DMA : 0, %memtile, DMA : 0) {symbol = @fifo0} - aie.flow(%memtile, DMA : 0, %tile83, DMA : 0) {symbol = @fifo1} - // ObjectFifo that can hold 4 memref<1xi32>s, populated by tile22 and - // consumed by tile23 - aie.objectfifo @fifo0 (%tile22, {%memtile}, 4 : i32) : !aie.objectfifo> - aie.objectfifo @fifo1 (%memtile, {%tile83}, [4, 4]) : !aie.objectfifo> - aie.objectfifo.link [@fifo0] -> [@fifo1] ([] []) - // Producer core - %core22 = aie.core(%tile22) { - %i0 = arith.constant 0 : index - %c55 = arith.constant 55 : i32 - %c66 = arith.constant 66 : i32 - %c77 = arith.constant 77 : i32 - %c88 = arith.constant 88 : i32 - // Push 55 - %subview0 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview> - %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<1xi32> - memref.store %c55, %subview0_obj[%i0] : memref<1xi32> - aie.objectfifo.release @fifo0 (Produce, 1) - // Push 66 - %subview1 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview> - %subview1_obj = 
aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<1xi32> - memref.store %c66, %subview1_obj[%i0] : memref<1xi32> - aie.objectfifo.release @fifo0 (Produce, 1) - // Push 77 - %subview2 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview> - %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<1xi32> - memref.store %c77, %subview2_obj[%i0] : memref<1xi32> - aie.objectfifo.release @fifo0 (Produce, 1) - // Push 88 - %subview3 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview> - %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref<1xi32> - memref.store %c88, %subview3_obj[%i0] : memref<1xi32> - aie.objectfifo.release @fifo0 (Produce, 1) - aie.end - } - // Consumer core - %core28 = aie.core(%tile83) { - // Consumer pattern: {1, 2, 1} - %i0 = arith.constant 0 : index - %i1 = arith.constant 1 : index - %i2 = arith.constant 2 : index - %i3 = arith.constant 3 : index - // Pop 1 object off queue - %subview0 = aie.objectfifo.acquire @fifo1 (Consume, 1) : !aie.objectfifosubview> - %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<1xi32> - %v55 = memref.load %subview0_obj[%i0] : memref<1xi32> - memref.store %v55, %buf83[%i0] : memref<1xi32> - aie.objectfifo.release @fifo1 (Consume, 1) - // Pop 2 objects off queue - %subview1 = aie.objectfifo.acquire @fifo1 (Consume, 2) : !aie.objectfifosubview> - %subview1_obj0 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<1xi32> - %subview1_obj1 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview> -> memref<1xi32> - %v66 = memref.load %subview1_obj0[%i0] : memref<1xi32> - %v77 = memref.load %subview1_obj1[%i0] : memref<1xi32> - memref.store %v66, %buf83[%i1] : memref<1xi32> - memref.store %v77, %buf83[%i2] : memref<1xi32> - aie.objectfifo.release @fifo1 (Consume, 2) - // Pop 1 object off queue - 
%subview2 = aie.objectfifo.acquire @fifo1 (Consume, 1) : !aie.objectfifosubview> - %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<1xi32> - %v88 = memref.load %subview2_obj[%i0] : memref<1xi32> - memref.store %v88, %buf83[%i3] : memref<1xi32> - aie.objectfifo.release @fifo1 (Consume, 1) - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir deleted file mode 100644 index 902ae6250..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir +++ /dev/null @@ -1,125 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// Tests objectFifo between cores, xfailing for now. -// XFAIL: * -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @fifo : memref -// CHECK: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK: %[[TILE_2_3:.*]] = aie.tile(2, 3) -// CHECK: %[[FIFO_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_0"} : memref -// CHECK: %[[FIFO_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_1"} : memref -// CHECK: %[[FIFO_BUFF_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_2"} : memref -// CHECK: %[[FIFO_BUFF_3:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_3"} : memref -// CHECK: %[[FIFO_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo_prod_lock"} -// CHECK: %[[FIFO_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo_cons_lock"} -// CHECK: %[[BUF23:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "buf23"} : memref<4xi32> -// CHECK: %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) { -// CHECK: %[[C99_I32:.*]] = arith.constant 99 : i32 -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// 
CHECK: memref.store %[[C99_I32]], %[[FIFO_BUFF_0]][] : memref -// CHECK: aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C99_I32]], %[[FIFO_BUFF_1]][] : memref -// CHECK: aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C99_I32]], %[[FIFO_BUFF_2]][] : memref -// CHECK: aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: memref.store %[[C99_I32]], %[[FIFO_BUFF_3]][] : memref -// CHECK: aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_2_3:.*]] = aie.core(%[[TILE_2_3]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: aie.use_lock(%[[FIFO_CONS_LOCK]], AcquireGreaterEqual, 2) -// CHECK: %[[VAL_0:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref -// CHECK: memref.store %[[VAL_0]], %[[BUF23]]{{\[}}%[[C0]]] : memref<4xi32> -// CHECK: %[[VAL_1:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref -// CHECK: memref.store %[[VAL_1]], %[[BUF23]]{{\[}}%[[C1]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[FIFO_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: %[[VAL_2:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref -// CHECK: memref.store %[[VAL_2]], %[[BUF23]]{{\[}}%[[C2]]] : memref<4xi32> -// CHECK: %[[VAL_3:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref -// CHECK: memref.store %[[VAL_3]], %[[BUF23]]{{\[}}%[[C3]]] : memref<4xi32> -// CHECK: aie.use_lock(%[[FIFO_PROD_LOCK]], Release, 3) -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @AIE2_delayed_release { - aie.device(xcve2302) { - %tile22 = aie.tile(2, 2) - %tile23 = aie.tile(2, 3) - %buf23 = aie.buffer(%tile23) {sym_name = "buf23"} : memref<4xi32> - aie.flow(%tile22, DMA : 
0, %tile23, DMA : 0) {symbol = @fifo} - aie.objectfifo @fifo (%tile22, {%tile23}, 4 : i32) : !aie.objectfifo> - // Producer -- produces one element at a time - %core22 = aie.core(%tile22) { - %c99 = arith.constant 99 : i32 - %i0 = arith.constant 0 : index - %i1 = arith.constant 1 : index - %i4 = arith.constant 4 : index - // Produce one 1 element (acquire producer lock) ... - %subview0 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview_obj0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref - memref.store %c99, %subview_obj0[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // ... done producing (release consumer lock) - // Produce one 1 element (acquire producer lock) ... - %subview1 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview_obj1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref - memref.store %c99, %subview_obj1[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // ... done producing (release consumer lock) - // Produce one 1 element (acquire producer lock) ... - %subview2 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview_obj2 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref - memref.store %c99, %subview_obj2[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // ... done producing (release consumer lock) - // Produce one 1 element (acquire producer lock) ... - %subview3 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview> - %subview_obj3 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref - memref.store %c99, %subview_obj3[] : memref - aie.objectfifo.release @fifo (Produce, 1) - // ... 
done producing (release consumer lock) - aie.end - } - // Consumer -- consumes {2, 1, 3, 1}; releases {0, 0, 0, 2} - %core23 = aie.core(%tile23) { - %i0 = arith.constant 0 : index - %i1 = arith.constant 1 : index - %i2 = arith.constant 2 : index - %i3 = arith.constant 3 : index - // Begin consuming 2 elements (acquire consumer lock with value 2) - %subview0 = aie.objectfifo.acquire @fifo (Consume, 2) : !aie.objectfifosubview> - %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref - %v0 = memref.load %subview0_obj[] : memref - memref.store %v0, %buf23[%i0] : memref<4xi32> - // For the next step, we only need one element (this could be a subroutine that acquires 1, not knowing that we already acquired 2) - %subview1 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview> - %subview1_obj = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref - %v1 = memref.load %subview1_obj[] : memref - memref.store %v1, %buf23[%i1] : memref<4xi32> - // Actually, give us the two from before and one more for three objects total (consumer lock should increase by one) - %subview2 = aie.objectfifo.acquire @fifo (Consume, 3) : !aie.objectfifosubview> - %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref - %v2 = memref.load %subview2_obj[] : memref - memref.store %v2, %buf23[%i2] : memref<4xi32> - // Now let's just work on one element (consumer lock should not change value) - %subview3 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview> - %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref - %v3 = memref.load %subview3_obj[] : memref - memref.store %v3, %buf23[%i3] : memref<4xi32> - // Done, let's release everything we hold (we hold 3 objects from our max acquire) - aie.objectfifo.release @fifo (Consume, 3) - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir 
b/compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir deleted file mode 100644 index 3bcf4ce7b..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir +++ /dev/null @@ -1,129 +0,0 @@ - -// RUN: iree-opt --amdaie-assign-lock-ids --split-input-file %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcvc1902) { -// CHECK: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK: %[[TILE_2_3:.*]] = aie.tile(2, 3) -// CHECK: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK: %[[TILE_3_4:.*]] = aie.tile(3, 4) -// CHECK: %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]], 0) -// CHECK: %[[LOCK_2_2_0:.*]] = aie.lock(%[[TILE_2_2]], 2) -// CHECK: %[[LOCK_2_2_1:.*]] = aie.lock(%[[TILE_2_2]], 1) -// CHECK: %[[LOCK_2_3:.*]] = aie.lock(%[[TILE_2_3]], 0) -// CHECK: %[[LOCK_2_3_2:.*]] = aie.lock(%[[TILE_2_3]], 1) -// CHECK: %[[LOCK_2_3_3:.*]] = aie.lock(%[[TILE_2_3]], 4) -// CHECK: %[[LOCK_2_3_4:.*]] = aie.lock(%[[TILE_2_3]], 2) -// CHECK: %[[LOCK_2_3_5:.*]] = aie.lock(%[[TILE_2_3]], 3) -// CHECK: %[[LOCK_2_3_6:.*]] = aie.lock(%[[TILE_2_3]], 5) -// CHECK: %[[LOCK_2_3_7:.*]] = aie.lock(%[[TILE_2_3]], 6) -// CHECK: %[[LOCK_2_3_8:.*]] = aie.lock(%[[TILE_2_3]], 7) -// CHECK: %[[LOCK_2_3_9:.*]] = aie.lock(%[[TILE_2_3]], 10) -// CHECK: %[[LOCK_2_3_10:.*]] = aie.lock(%[[TILE_2_3]], 11) -// CHECK: %[[LOCK_2_3_11:.*]] = aie.lock(%[[TILE_2_3]], 8) -// CHECK: %[[LOCK_2_3_12:.*]] = aie.lock(%[[TILE_2_3]], 9) -// CHECK: %[[LOCK_2_3_13:.*]] = aie.lock(%[[TILE_2_3]], 12) -// CHECK: %[[LOCK_2_3_14:.*]] = aie.lock(%[[TILE_2_3]], 13) -// CHECK: %[[LOCK_2_3_15:.*]] = aie.lock(%[[TILE_2_3]], 14) -// CHECK: %[[LOCK_2_3_16:.*]] = aie.lock(%[[TILE_2_3]], 15) -// CHECK: %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]], 0) -// CHECK: %[[LOCK_3_3_17:.*]] = aie.lock(%[[TILE_3_3]], 1) -// CHECK: %[[LOCK_3_3_18:.*]] = aie.lock(%[[TILE_3_3]], 9) -// CHECK: %[[LOCK_3_3_19:.*]] = aie.lock(%[[TILE_3_3]], 2) -// CHECK: %[[LOCK_3_4:.*]] = aie.lock(%[[TILE_3_4]], 0) -// CHECK: %[[LOCK_3_4_20:.*]] = aie.lock(%[[TILE_3_4]], 
1) -// CHECK: %[[LOCK_3_4_21:.*]] = aie.lock(%[[TILE_3_4]], 2) -// CHECK: %[[LOCK_3_4_22:.*]] = aie.lock(%[[TILE_3_4]], 3) -// CHECK: %[[TILE_6_0:.*]] = aie.tile(6, 0) -// CHECK: %[[LOCK_6_0:.*]] = aie.lock(%[[TILE_6_0]], 0) -// CHECK: } - -module @test_assign_lockIDs { - aie.device(xcvc1902) { - %t22 = aie.tile(2, 2) - %t23 = aie.tile(2, 3) - %t33 = aie.tile(3, 3) - %t34 = aie.tile(3, 4) - %l22_0 = aie.lock(%t22, 0) - %l22_2 = aie.lock(%t22, 2) - %l22_1 = aie.lock(%t22) - %l23_0 = aie.lock(%t23) - %l23_1 = aie.lock(%t23) - %l23_4 = aie.lock(%t23, 4) - %l23_2 = aie.lock(%t23) - %l23_3 = aie.lock(%t23) - %l23_5 = aie.lock(%t23) - %l23_6 = aie.lock(%t23) - %l23_7 = aie.lock(%t23) - %l23_10 = aie.lock(%t23) - %l23_11 = aie.lock(%t23) - %l23_8 = aie.lock(%t23, 8) - %l23_9 = aie.lock(%t23, 9) - %l23_12 = aie.lock(%t23) - %l23_13 = aie.lock(%t23) - %l23_14 = aie.lock(%t23) - %l23_15 = aie.lock(%t23) - %l33_0 = aie.lock(%t33, 0) - %l33_1 = aie.lock(%t33) - %l33_9 = aie.lock(%t33, 9) - %l33_2 = aie.lock(%t33) - %l34_0 = aie.lock(%t34) - %l34_1 = aie.lock(%t34) - %l34_2 = aie.lock(%t34) - %l34_3 = aie.lock(%t34) - %t60 = aie.tile(6, 0) - %l60 = aie.lock(%t60) - } -} - -// ----- - -// CHECK-LABEL: aie.device(xcve2802) { -// CHECK: %[[TILE_1_1:.*]] = aie.tile(1, 1) -// CHECK: %[[LOCK_1_1:.*]] = aie.lock(%[[TILE_1_1]], 1) -// CHECK: %[[LOCK_1_1_0:.*]] = aie.lock(%[[TILE_1_1]], 0) -// CHECK: %[[LOCK_1_1_1:.*]] = aie.lock(%[[TILE_1_1]], 3) -// CHECK: %[[LOCK_1_1_2:.*]] = aie.lock(%[[TILE_1_1]], 4) -// CHECK: %[[LOCK_1_1_3:.*]] = aie.lock(%[[TILE_1_1]], 5) -// CHECK: %[[LOCK_1_1_4:.*]] = aie.lock(%[[TILE_1_1]], 6) -// CHECK: %[[LOCK_1_1_5:.*]] = aie.lock(%[[TILE_1_1]], 7) -// CHECK: %[[LOCK_1_1_6:.*]] = aie.lock(%[[TILE_1_1]], 8) -// CHECK: %[[LOCK_1_1_7:.*]] = aie.lock(%[[TILE_1_1]], 9) -// CHECK: %[[LOCK_1_1_8:.*]] = aie.lock(%[[TILE_1_1]], 10) -// CHECK: %[[LOCK_1_1_9:.*]] = aie.lock(%[[TILE_1_1]], 11) -// CHECK: %[[LOCK_1_1_10:.*]] = aie.lock(%[[TILE_1_1]], 12) -// CHECK: 
%[[LOCK_1_1_11:.*]] = aie.lock(%[[TILE_1_1]], 13) -// CHECK: %[[LOCK_1_1_12:.*]] = aie.lock(%[[TILE_1_1]], 14) -// CHECK: %[[LOCK_1_1_13:.*]] = aie.lock(%[[TILE_1_1]], 33) -// CHECK: %[[LOCK_1_1_14:.*]] = aie.lock(%[[TILE_1_1]], 15) -// CHECK: %[[LOCK_1_1_15:.*]] = aie.lock(%[[TILE_1_1]], 16) -// CHECK: %[[LOCK_1_1_16:.*]] = aie.lock(%[[TILE_1_1]], 17) -// CHECK: %[[LOCK_1_1_17:.*]] = aie.lock(%[[TILE_1_1]], 18) -// CHECK: %[[LOCK_1_1_18:.*]] = aie.lock(%[[TILE_1_1]], 2) -// CHECK: } - -module @memTileTest { - aie.device(xcve2802) { - // Memory tiles on xcve have 64 locks. - %tmemtile = aie.tile(1,1) - %l0 = aie.lock(%tmemtile, 1) - %l1 = aie.lock(%tmemtile, 0) - %l2 = aie.lock(%tmemtile) - %l3 = aie.lock(%tmemtile) - %l4 = aie.lock(%tmemtile) - %l5 = aie.lock(%tmemtile) - %l6 = aie.lock(%tmemtile) - %l7 = aie.lock(%tmemtile) - %l8 = aie.lock(%tmemtile) - %l9 = aie.lock(%tmemtile) - %l10 = aie.lock(%tmemtile) - %l11 = aie.lock(%tmemtile) - %l12 = aie.lock(%tmemtile) - %l13 = aie.lock(%tmemtile) - %l14 = aie.lock(%tmemtile,33) - %l15 = aie.lock(%tmemtile) - %l16 = aie.lock(%tmemtile) - %l17 = aie.lock(%tmemtile) - %l18 = aie.lock(%tmemtile) - %l19 = aie.lock(%tmemtile,2) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir deleted file mode 100644 index 205bfeea0..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir +++ /dev/null @@ -1,123 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @of1 : memref<16xi32> -// CHECK: memref.global "public" @of0 : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_1_3:.*]] = aie.tile(1, 3) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: 
%[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_5:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_6:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_7:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2_8:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_9:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_10:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_11:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_12:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = 
"of0_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_3_13:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @of1} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_1_3]], DMA : 0) {symbol = @of0} -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 1, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_4]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_5]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_6]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_7]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb8) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: 
%[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_10]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_11]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_12]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @elementGenerationAIE1 { - aie.device(npu1_4col) { - %tile12 = aie.tile(1, 2) - %tile13 = aie.tile(1, 3) - %tile33 = aie.tile(3, 3) - aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of1} - aie.flow(%tile12, DMA : 1, %tile13, DMA : 
0) {symbol = @of0} - // In the shared memory case, the number of elements does not change. - aie.objectfifo @of0 (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo> - // In the non-adjacent memory case, the number of elements depends on the max amount acquired by - // the processes running on each core (here nothing is specified so it cannot be derived). - aie.objectfifo @of1 (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo> - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir deleted file mode 100644 index 160c2b596..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir +++ /dev/null @@ -1,123 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of1 : memref<16xi32> -// CHECK: memref.global "public" @of0 : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_1_3:.*]] = aie.tile(1, 3) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"} -// 
CHECK-DAG: %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_5:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_6:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_7:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2_8:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_9:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_10:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_11:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_12:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_3_13:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @of1} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_1_3]], DMA : 0) {symbol = @of0} -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 1, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_4]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: 
aie.dma_bd(%[[BUFFER_1_2_5]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_6]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_7]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_8]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb8) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_10]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_11]] : memref<16xi32>) {len = 16 : i32} -// CHECK: 
aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_12]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_13]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @elementGenerationAIE2 { - aie.device(xcve2302) { - %tile12 = aie.tile(1, 2) - %tile13 = aie.tile(1, 3) - %tile33 = aie.tile(3, 3) - aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of1} - aie.flow(%tile12, DMA : 1, %tile13, DMA : 0) {symbol = @of0} - // In the shared memory case, the number of elements does not change. - aie.objectfifo @of0 (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo> - // In the non-adjacent memory case, the number of elements depends on the max amount acquired by - // the processes running on each core (here nothing is specified so it cannot be derived). 
- aie.objectfifo @of1 (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo> - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir deleted file mode 100644 index cbe78efec..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir +++ /dev/null @@ -1,374 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -// The script is designed to make adding checks to -// a test case fast, it is *not* designed to be authoritative -// about what constitutes a good test! The CHECK should be -// minimized and named to reflect the test intent. - - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @broadcast_of : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_1_3:.*]] = aie.tile(1, 3) -// CHECK-DAG: %[[TILE_1_4:.*]] = aie.tile(1, 4) -// CHECK-DAG: %[[TILE_3_2:.*]] = aie.tile(3, 2) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "broadcast_of_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_0:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "broadcast_of_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 2 : i8, sym_name = "broadcast_of_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_3_1:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "broadcast_of_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "broadcast_of_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "broadcast_of_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_3:.*]] = 
aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_4:.*]] = aie.buffer(%[[TILE_1_4]]) {sym_name = "broadcast_of_cons_buff_1_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_4_4:.*]] = aie.buffer(%[[TILE_1_4]]) {sym_name = "broadcast_of_cons_buff_1_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_4:.*]] = aie.lock(%[[TILE_1_4]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_1"} -// CHECK-DAG: %[[LOCK_1_4_5:.*]] = aie.lock(%[[TILE_1_4]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_1"} -// CHECK-DAG: %[[BUFFER_3_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "broadcast_of_cons_buff_2_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_2_6:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "broadcast_of_cons_buff_2_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_2:.*]] = aie.lock(%[[TILE_3_2]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_2"} -// CHECK-DAG: %[[LOCK_3_2_7:.*]] = aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_2"} -// CHECK-DAG: %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "broadcast_of_cons_buff_3_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_3_8:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "broadcast_of_cons_buff_3_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_3"} -// CHECK-DAG: %[[LOCK_3_3_9:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_3"} -// CHECK-DAG: aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @broadcast_of} -// CHECK-DAG: aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_3_2]], DMA : 0) {symbol = @broadcast_of} -// CHECK-DAG: aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_1_4]], DMA : 0) {symbol = @broadcast_of} -// CHECK-DAG: aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_1_2]], DMA : 0) {symbol = @broadcast_of} -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return 
-// CHECK: } -// CHECK: %[[CORE_1_3:.*]] = aie.core(%[[TILE_1_3]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_3]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_3_1]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_3_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_3_1]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2_2]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_1_4:.*]] = aie.core(%[[TILE_1_4]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C3]] { -// CHECK: aie.use_lock(%[[LOCK_1_4_5]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_1_4]]) : (memref<16xi32>) -> () -// CHECK: func.call 
@some_work(%[[BUFFER_1_4_4]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_4]], Release, 2) -// CHECK: aie.use_lock(%[[LOCK_1_4_5]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_1_4]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_1_4_4]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_4]], Release, 2) -// CHECK: aie.use_lock(%[[LOCK_1_4_5]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_1_4]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_1_4_4]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_4]], Release, 2) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_3_2:.*]] = aie.core(%[[TILE_3_2]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C4]] { -// CHECK: aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3) -// CHECK: func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_2]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3) -// CHECK: func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_2]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3) -// CHECK: func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> () -// CHECK: 
aie.use_lock(%[[LOCK_3_2]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3) -// CHECK: func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_2]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C3]] { -// CHECK: aie.use_lock(%[[LOCK_3_3_9]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_3_8]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_3_3_9]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_3_8]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_3_3_9]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_3_3_8]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_3]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_3_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: 
aie.use_lock(%[[LOCK_1_3_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_4:.*]] = aie.mem(%[[TILE_1_4]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_4]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_4]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_4_5]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_4]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_4_4]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_4_5]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_2:.*]] = aie.mem(%[[TILE_3_2]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_7]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2_6]] : 
memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_7]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_9]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3_8]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_9]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } - - -module @broadcast { - aie.device(npu1_4col) { - %tile12 = aie.tile(1, 2) - %tile13 = aie.tile(1, 3) - %tile14 = aie.tile(1, 4) - %tile32 = aie.tile(3, 2) - %tile33 = aie.tile(3, 3) - aie.flow(%tile13, DMA : 0, %tile33, DMA : 0) {symbol = @broadcast_of} - aie.flow(%tile13, DMA : 0, %tile32, DMA : 0) {symbol = @broadcast_of} - aie.flow(%tile13, DMA : 0, %tile14, DMA : 0) {symbol = @broadcast_of} - aie.flow(%tile13, DMA : 0, %tile12, DMA : 0) {symbol = @broadcast_of} - aie.objectfifo @broadcast_of (%tile13, {%tile12, %tile14, %tile32, %tile33}, [2]) : !aie.objectfifo> - func.func @some_work(%lineOut : memref<16xi32>) -> () { - return - } - %core13 = aie.core(%tile13) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @broadcast_of (Produce, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Produce, 1) - %subview1 = 
aie.objectfifo.acquire @broadcast_of (Produce, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Produce, 1) - } - aie.end - } - %core12 = aie.core(%tile12) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @broadcast_of (Consume, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - } - aie.end - } - %core14 = aie.core(%tile14) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c3 { - %subview = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 2) - %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview> - %elem2 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem3 = 
aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem2) : (memref<16xi32>) -> () - func.call @some_work(%elem3) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 2) - %subview2 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview> - %elem4 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem5 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem4) : (memref<16xi32>) -> () - func.call @some_work(%elem5) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 2) - } - aie.end - } - %core32 = aie.core(%tile32) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c4 { - %subview = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview> -> memref<16xi32> - %elem2 = aie.objectfifo.subview.access %subview[2] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - func.call @some_work(%elem1) : (memref<16xi32>) -> () - func.call @some_work(%elem2) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview> - %elem3 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem4 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview> -> memref<16xi32> - %elem5 = aie.objectfifo.subview.access %subview1[2] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem3) : (memref<16xi32>) -> () - func.call 
@some_work(%elem4) : (memref<16xi32>) -> () - func.call @some_work(%elem5) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - - %subview2 = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview> - %elem6 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem7 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview> -> memref<16xi32> - %elem8 = aie.objectfifo.subview.access %subview2[2] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem6) : (memref<16xi32>) -> () - func.call @some_work(%elem7) : (memref<16xi32>) -> () - func.call @some_work(%elem8) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - - %subview3 = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview> - %elem9 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem10 = aie.objectfifo.subview.access %subview3[1] : !aie.objectfifosubview> -> memref<16xi32> - %elem11 = aie.objectfifo.subview.access %subview3[2] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem9) : (memref<16xi32>) -> () - func.call @some_work(%elem10) : (memref<16xi32>) -> () - func.call @some_work(%elem11) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - } - aie.end - } - %core33 = aie.core(%tile33) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c3 { - %subview = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - func.call @some_work(%elem1) : (memref<16xi32>) -> 
() - aie.objectfifo.release @broadcast_of (Consume, 1) - - %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview> - %elem2 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem3 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem2) : (memref<16xi32>) -> () - func.call @some_work(%elem3) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - - %subview2 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview> - %elem4 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem5 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem4) : (memref<16xi32>) -> () - func.call @some_work(%elem5) : (memref<16xi32>) -> () - aie.objectfifo.release @broadcast_of (Consume, 1) - } - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir deleted file mode 100644 index 38d37b926..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir +++ /dev/null @@ -1,80 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcvc1902) { -// CHECK: memref.global "public" @of2 : memref<16xi32> -// CHECK: memref.global "public" @of1 : memref<16xi32> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[OF2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of2_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of2_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = 
"of2_cons_prod_lock_0"} -// CHECK-DAG: %[[OF2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of2_cons_cons_lock_0"} -// CHECK-DAG: %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_link_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_link_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_link_prod_lock_0"} -// CHECK-DAG: %[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_link_cons_lock_0"} -// CHECK-DAG: %[[OF1_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of1_prod_prod_lock_0"} -// CHECK-DAG: %[[OF1_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_1_2]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_2_2]], DMA : 0) -// CHECK-DAG: %[[EXT_BUFF_IN:.*]] = aie.external_buffer {sym_name = "ext_buff_in"} : memref<16xi32> -// CHECK-DAG: aie.shim_dma_allocation @of1(MM2S, 0, 2) -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_0]] : 
memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF2_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF2_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @link_AIE1 { - aie.device(xcvc1902) { - %tile20 = aie.tile(2, 0) - %tile12 = aie.tile(1, 2) - %tile22 = aie.tile(2, 2) - aie.flow(%tile20, DMA : 0, %tile12, DMA : 0) {symbol = @of1} - aie.flow(%tile12, DMA : 0, %tile22, DMA : 0) {symbol = @of2} - aie.objectfifo @of1 (%tile20, {%tile12}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @of2 (%tile12, {%tile22}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@of1] -> [@of2] ([] []) - %ext_buff_in = aie.external_buffer {sym_name = "ext_buff_in"} : memref<16xi32> - aie.objectfifo.register_external_buffers @of1 (%tile20, {%ext_buff_in}) : (memref<16xi32>) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir deleted file mode 100644 index dbd14e8f2..000000000 --- 
a/compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir +++ /dev/null @@ -1,80 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @from_memTile : memref<16xi32> -// CHECK: memref.global "public" @to_memTile : memref<16xi32> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[FROM_MEMTILE_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "from_memTile_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[FROM_MEMTILE_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "from_memTile_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[FROM_MEMTILE_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "from_memTile_cons_prod_lock_0"} -// CHECK-DAG: %[[FROM_MEMTILE_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "from_memTile_cons_cons_lock_0"} -// CHECK-DAG: %[[TO_MEMTILE_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "to_memTile_link_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[TO_MEMTILE_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "to_memTile_link_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[TO_MEMTILE_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = "to_memTile_link_prod_lock_0"} -// CHECK-DAG: %[[TO_MEMTILE_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "to_memTile_link_cons_lock_0"} -// CHECK-DAG: %[[TO_MEMTILE_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "to_memTile_prod_prod_lock_0"} -// CHECK-DAG: %[[TO_MEMTILE_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "to_memTile_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0) -// CHECK-DAG: %[[EXT_BUFF_IN:.*]] 
= aie.external_buffer {sym_name = "ext_buff_in"} : memref<16xi32> -// CHECK-DAG: aie.shim_dma_allocation @to_memTile(MM2S, 0, 2) -// CHECK: %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FROM_MEMTILE_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) 
-// CHECK: aie.dma_bd(%[[FROM_MEMTILE_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @link_DDR_L1 { - aie.device(xcve2302) { - %tile20 = aie.tile(2, 0) - %tile21 = aie.tile(2, 1) - %tile22 = aie.tile(2, 2) - aie.flow(%tile20, DMA : 0, %tile21, DMA : 0) {symbol = @to_memTile} - aie.flow(%tile21, DMA : 0, %tile22, DMA : 0) {symbol = @from_memTile} - aie.objectfifo @to_memTile (%tile20, {%tile21}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @from_memTile (%tile21, {%tile22}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@to_memTile] -> [@from_memTile] ([] []) - %ext_buff_in = aie.external_buffer {sym_name = "ext_buff_in"}: memref<16xi32> - aie.objectfifo.register_external_buffers @to_memTile (%tile20, {%ext_buff_in}) : (memref<16xi32>) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir deleted file mode 100644 index c022a2a62..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir +++ /dev/null @@ -1,81 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @from_memTile : memref<48xi32> -// CHECK: memref.global "public" @to_memTile : memref<16xi32> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[FROM_MEMTILE_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "from_memTile_cons_prod_lock_0"} -// CHECK-DAG: %[[FROM_MEMTILE_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "from_memTile_cons_cons_lock_0"} -// CHECK-DAG: %[[FROM_MEMTILE_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = 
"from_memTile_link_buff_0_0"} : memref<48xi32> -// CHECK-DAG: %[[FROM_MEMTILE_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "from_memTile_link_buff_0_1"} : memref<48xi32> -// CHECK-DAG: %[[FROM_MEMTILE_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = "from_memTile_link_prod_lock_0"} -// CHECK-DAG: %[[FROM_MEMTILE_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "from_memTile_link_cons_lock_0"} -// CHECK-DAG: %[[TO_MEMTILE_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "to_memTile_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[TO_MEMTILE_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "to_memTile_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[TO_MEMTILE_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "to_memTile_prod_prod_lock_0"} -// CHECK-DAG: %[[TO_MEMTILE_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "to_memTile_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_1]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_0]], DMA : 0) -// CHECK-DAG: %[[EXT_BUFF_IN:.*]] = aie.external_buffer {sym_name = "ext_buff_in"} : memref<48xi32> -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[TO_MEMTILE_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[TO_MEMTILE_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[TO_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[TO_MEMTILE_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[TO_MEMTILE_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: aie.shim_dma_allocation @from_memTile(S2MM, 0, 2) -// CHECK: %[[MEMTILE_DMA_2_1:.*]] = 
aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FROM_MEMTILE_BUFF_0]] : memref<48xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FROM_MEMTILE_BUFF_1]] : memref<48xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_2:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FROM_MEMTILE_BUFF_0]] : memref<48xi32>) {len = 48 : i32} -// CHECK: aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[FROM_MEMTILE_BUFF_1]] : memref<48xi32>) {len = 48 : i32} -// CHECK: aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @link_L1_DDR { - aie.device(xcve2302) { - %tile20 = aie.tile(2, 0) - %tile21 = aie.tile(2, 1) - %tile22 = aie.tile(2, 2) - aie.flow(%tile22, DMA : 0, %tile21, DMA : 0) {symbol = @to_memTile} - aie.flow(%tile21, DMA : 0, %tile20, DMA : 0) {symbol = @from_memTile} - aie.objectfifo @to_memTile (%tile22, {%tile21}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @from_memTile (%tile21, {%tile20}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@to_memTile] -> [@from_memTile] ([] []) - %ext_buff_in = aie.external_buffer {sym_name = "ext_buff_in"}: memref<48xi32> - aie.objectfifo.register_external_buffers @from_memTile (%tile20, {%ext_buff_in}) : (memref<48xi32>) - } -} diff 
--git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir deleted file mode 100644 index 7379bdbe7..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir +++ /dev/null @@ -1,136 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @skip_connection : memref<16xi32> -// CHECK: memref.global "public" @link2 : memref<16xi32> -// CHECK: memref.global "public" @link1 : memref<48xi32> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[BUFFER_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "skip_connection_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_2_2_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "skip_connection_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "skip_connection_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_2_2_1:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "skip_connection_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "skip_connection_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "skip_connection_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "skip_connection_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "skip_connection_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_2_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_0"} : memref<48xi32> -// CHECK-DAG: %[[BUFFER_2_1_4:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_1"} : 
memref<48xi32> -// CHECK-DAG: %[[LOCK_2_1:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = "link1_link_prod_lock_0"} -// CHECK-DAG: %[[LOCK_2_1_5:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "link1_link_cons_lock_0"} -// CHECK-DAG: %[[LOCK_2_0:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_2_0_6:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_2_2_7:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_2_2_8:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_2_2_9:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "link2_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_2_2_10:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "link2_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_3_3_11:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link2_cons_buff_1_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_3_12:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link2_cons_buff_1_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_3_13:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "link2_cons_prod_lock_1"} -// CHECK-DAG: %[[LOCK_3_3_14:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "link2_cons_cons_lock_1"} -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0) {symbol = @link1} -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @link2} -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0) {symbol = @link2} -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_3_3]], DMA : 1) {symbol = @skip_connection} -// CHECK-DAG: aie.shim_dma_allocation @link1(MM2S, 0, 2) -// CHECK: %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_2_1]], 
AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_1]] : memref<48xi32>) {len = 48 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_1_5]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_1_4]] : memref<48xi32>) {len = 48 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_1_5]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_2_1_5]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_1]] : memref<48xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_1]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[LOCK_2_1_5]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_1_4]] : memref<48xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_1]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_2_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_7]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2_10]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_2_2_9]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_8]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2_10]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_3:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_2_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[LOCK_2_2_1]], AcquireGreaterEqual, 
1) -// CHECK: aie.dma_bd(%[[BUFFER_2_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_2_2]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_3_13]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3_11]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_14]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_3_13]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3_12]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_14]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_5:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @link_broadcast { - aie.device(xcve2302) { - %tile20 = aie.tile(2, 0) - %tile21 = aie.tile(2, 1) - %tile22 = aie.tile(2, 2) - %tile33 = aie.tile(3, 3) - aie.flow(%tile20, DMA : 0, %tile21, DMA : 0) {symbol = @link1} - aie.flow(%tile21, DMA : 0, %tile33, DMA : 0) {symbol = @link2} - aie.flow(%tile21, DMA : 0, %tile22, DMA : 0) {symbol = @link2} - aie.flow(%tile22, DMA : 0, %tile33, DMA : 1) {symbol = @skip_connection} - aie.objectfifo @link1 (%tile20, {%tile21}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @link2 (%tile21, {%tile22, %tile33}, [2]) : !aie.objectfifo> - 
aie.objectfifo @skip_connection (%tile22, {%tile33}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@link1] -> [@link2] ([] []) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir deleted file mode 100644 index 9eb8048e5..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir +++ /dev/null @@ -1,155 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @link4 : memref<12xi32> -// CHECK: memref.global "public" @link3 : memref<20xi32> -// CHECK: memref.global "public" @link2 : memref<4x4xi32> -// CHECK: memref.global "public" @link1 : memref<48xi32> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_2_3:.*]] = aie.tile(2, 3) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[LINK4_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_cons_buff_2_0"} : memref<12xi32> -// CHECK-DAG: %[[LINK4_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_cons_buff_2_1"} : memref<12xi32> -// CHECK-DAG: %[[LINK4_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "link4_cons_prod_lock_2"} -// CHECK-DAG: %[[LINK4_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "link4_cons_cons_lock_2"} -// CHECK-DAG: %[[LINK3_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_cons_buff_1_0"} : memref<20xi32> -// CHECK-DAG: %[[LINK3_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_cons_buff_1_1"} : memref<20xi32> -// CHECK-DAG: %[[LINK3_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "link3_cons_prod_lock_1"} -// CHECK-DAG: %[[LINK3_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = 
"link3_cons_cons_lock_1"} -// CHECK-DAG: %[[LINK2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_0"} : memref<4x4xi32> -// CHECK-DAG: %[[LINK2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_1"} : memref<4x4xi32> -// CHECK-DAG: %[[LINK2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "link2_cons_prod_lock_0"} -// CHECK-DAG: %[[LINK2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "link2_cons_cons_lock_0"} -// CHECK-DAG: %[[LINK1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_0"} : memref<48xi32> -// CHECK-DAG: %[[LINK1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_1"} : memref<48xi32> -// CHECK-DAG: %[[LINK1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 6 : i8, sym_name = "link1_link_prod_lock_0"} -// CHECK-DAG: %[[LINK1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "link1_link_cons_lock_0"} -// CHECK-DAG: %[[LINK1_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_prod_lock_0"} -// CHECK-DAG: %[[LINK1_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 1, %[[TILE_2_3]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 2, %[[TILE_3_3]], DMA : 0) -// CHECK-DAG: %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<48xi32> -// CHECK-DAG: aie.shim_dma_allocation @link1(MM2S, 0, 2) -// CHECK: %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], AcquireGreaterEqual, 3) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 48 : i32} -// 
CHECK: aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], Release, 3) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], AcquireGreaterEqual, 3) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 48 : i32} -// CHECK: aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], Release, 3) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: %[[VAL_2:.*]] = aie.dma_start(MM2S, 1, ^bb7, ^bb9) -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 20 : i32, offset = 16 : i32} -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb8 -// CHECK: ^bb8: -// CHECK: aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 20 : i32, offset = 16 : i32} -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb9: -// CHECK: %[[VAL_3:.*]] = aie.dma_start(MM2S, 2, ^bb10, ^bb12) -// CHECK: ^bb10: -// CHECK: aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 12 : i32, offset = 36 : i32} -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb11 -// CHECK: ^bb11: -// CHECK: 
aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 12 : i32, offset = 36 : i32} -// CHECK: aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb10 -// CHECK: ^bb12: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK2_CONS_BUFF_0]] : memref<4x4xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LINK2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK2_CONS_BUFF_1]] : memref<4x4xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LINK2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) { -// CHECK: %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK3_CONS_BUFF_0]] : memref<20xi32>) {len = 20 : i32} -// CHECK: aie.use_lock(%[[LINK3_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK3_CONS_BUFF_1]] : memref<20xi32>) {len = 20 : i32} -// CHECK: aie.use_lock(%[[LINK3_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_6:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK4_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK4_CONS_BUFF_0]] : memref<12xi32>) {len = 12 : i32} -// CHECK: 
aie.use_lock(%[[LINK4_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK4_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK4_CONS_BUFF_1]] : memref<12xi32>) {len = 12 : i32} -// CHECK: aie.use_lock(%[[LINK4_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @link_distribute { - aie.device(xcve2302) { - %tile20 = aie.tile(2, 0) - %tile21 = aie.tile(2, 1) - %tile22 = aie.tile(2, 2) - %tile23 = aie.tile(2, 3) - %tile33 = aie.tile(3, 3) - aie.flow(%tile20, DMA : 0, %tile21, DMA : 0) {symbol = @link1} - aie.flow(%tile21, DMA : 0, %tile22, DMA : 0) {symbol = @link2} - aie.flow(%tile21, DMA : 1, %tile23, DMA : 0) {symbol = @link3} - aie.flow(%tile21, DMA : 2, %tile33, DMA : 0) {symbol = @link4} - aie.objectfifo @link1 (%tile20, {%tile21}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @link2 (%tile21, {%tile22}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @link3 (%tile21, {%tile23}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @link4 (%tile21, {%tile33}, 2 : i32) : !aie.objectfifo> - %ext_buffer_in = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<48xi32> - aie.objectfifo.register_external_buffers @link1 (%tile20, {%ext_buffer_in}) : (memref<48xi32>) - aie.objectfifo.link [@link1] -> [@link2, @link3, @link4] ([] []) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir deleted file mode 100644 index 101d94a45..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir +++ /dev/null @@ -1,191 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK-DAG: memref.global "public" @link5 : memref<512xi8> -// CHECK-DAG: memref.global "public" @link4 : memref<128xi8> -// CHECK-DAG: memref.global "public" @link3 : memref<128xi8> -// 
CHECK-DAG: memref.global "public" @link2 : memref<128xi8> -// CHECK-DAG: memref.global "public" @link1 : memref<128xi8> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_2_3:.*]] = aie.tile(2, 3) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[LINK5_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link5_cons_prod_lock_0"} -// CHECK-DAG: %[[LINK5_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link5_cons_cons_lock_0"} -// CHECK-DAG: %[[LINK5_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link5_link_buff_0_0"} : memref<512xi8> -// CHECK-DAG: %[[LINK5_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link5_link_buff_0_1"} : memref<512xi8> -// CHECK-DAG: %[[LINK5_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 8 : i8, sym_name = "link5_link_prod_lock_0"} -// CHECK-DAG: %[[LINK5_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "link5_link_cons_lock_0"} -// CHECK-DAG: %[[LINK4_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_prod_buff_3_0"} : memref<128xi8> -// CHECK-DAG: %[[LINK4_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_prod_buff_3_1"} : memref<128xi8> -// CHECK-DAG: %[[LINK4_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "link4_prod_prod_lock_3"} -// CHECK-DAG: %[[LINK4_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "link4_prod_cons_lock_3"} -// CHECK-DAG: %[[LINK3_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_prod_buff_2_0"} : memref<128xi8> -// CHECK-DAG: %[[LINK3_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_prod_buff_2_1"} : memref<128xi8> -// CHECK-DAG: %[[LINK3_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "link3_prod_prod_lock_2"} -// CHECK-DAG: %[[LINK3_CONS_LOCK:.*]] = 
aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "link3_prod_cons_lock_2"} -// CHECK-DAG: %[[LINK2_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_prod_buff_1_0"} : memref<128xi8> -// CHECK-DAG: %[[LINK2_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_prod_buff_1_1"} : memref<128xi8> -// CHECK-DAG: %[[LINK2_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "link2_prod_prod_lock_1"} -// CHECK-DAG: %[[LINK2_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "link2_prod_cons_lock_1"} -// CHECK-DAG: %[[LINK1_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "link1_prod_buff_0_0"} : memref<128xi8> -// CHECK-DAG: %[[LINK1_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "link1_prod_buff_0_1"} : memref<128xi8> -// CHECK-DAG: %[[LINK1_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "link1_prod_prod_lock_0"} -// CHECK-DAG: %[[LINK1_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "link1_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_2_1]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_1]], DMA : 1) -// CHECK-DAG: aie.flow(%[[TILE_2_3]], DMA : 0, %[[TILE_2_1]], DMA : 2) -// CHECK-DAG: aie.flow(%[[TILE_3_3]], DMA : 0, %[[TILE_2_1]], DMA : 3) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_0]], DMA : 0) -// CHECK-DAG: %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<512xi8> -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK1_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_BUFF_0]] : memref<128xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK1_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK1_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK1_BUFF_1]] : memref<128xi8>) {len = 128 
: i32} -// CHECK: aie.use_lock(%[[LINK1_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32, offset = 128 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32, offset = 128 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 2, ^bb7, ^bb9) -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32, offset = 256 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb8 -// CHECK: ^bb8: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32, offset = 256 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: 
aie.next_bd ^bb7 -// CHECK: ^bb9: -// CHECK: %[[VAL_4:.*]] = aie.dma_start(S2MM, 3, ^bb10, ^bb12) -// CHECK: ^bb10: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32, offset = 384 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb11 -// CHECK: ^bb11: -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32, offset = 384 : i32} -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb10 -// CHECK: ^bb12: -// CHECK: %[[VAL_5:.*]] = aie.dma_start(MM2S, 0, ^bb13, ^bb15) -// CHECK: ^bb13: -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], AcquireGreaterEqual, 4) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 512 : i32} -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], Release, 4) -// CHECK: aie.next_bd ^bb14 -// CHECK: ^bb14: -// CHECK: aie.use_lock(%[[LINK5_CONS_LOCK]], AcquireGreaterEqual, 4) -// CHECK: aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 512 : i32} -// CHECK: aie.use_lock(%[[LINK5_PROD_LOCK]], Release, 4) -// CHECK: aie.next_bd ^bb13 -// CHECK: ^bb15: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_6:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK2_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK2_BUFF_0]] : memref<128xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK2_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK2_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK2_BUFF_1]] : memref<128xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK2_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) { -// 
CHECK: %[[VAL_7:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK3_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK3_BUFF_0]] : memref<128xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK3_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK3_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK3_BUFF_1]] : memref<128xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK3_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: aie.shim_dma_allocation @link5(S2MM, 0, 2) -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_8:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LINK4_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK4_BUFF_0]] : memref<128xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK4_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LINK4_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[LINK4_BUFF_1]] : memref<128xi8>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[LINK4_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @link_join { - aie.device(xcve2302) { - %tile20 = aie.tile(2, 0) - %tile21 = aie.tile(2, 1) - %tile12 = aie.tile(1, 2) - %tile22 = aie.tile(2, 2) - %tile23 = aie.tile(2, 3) - %tile33 = aie.tile(3, 3) - aie.flow(%tile12, DMA : 0, %tile21, DMA : 0) {symbol = @link1} - aie.flow(%tile22, DMA : 0, %tile21, DMA : 1) {symbol = @link2} - aie.flow(%tile23, DMA : 0, %tile21, DMA : 2) {symbol = @link3} - aie.flow(%tile33, DMA : 0, %tile21, DMA : 3) {symbol = @link4} - aie.flow(%tile21, DMA : 0, %tile20, DMA : 0) {symbol = @link5} - aie.objectfifo @link1 (%tile12, {%tile21}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @link2 (%tile22, {%tile21}, 2 : i32) : 
!aie.objectfifo> - aie.objectfifo @link3 (%tile23, {%tile21}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @link4 (%tile33, {%tile21}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @link5 (%tile21, {%tile20}, 2 : i32) : !aie.objectfifo> - %ext_buffer_in = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<512xi8> - aie.objectfifo.register_external_buffers @link5 (%tile20, {%ext_buffer_in}) : (memref<512xi8>) - aie.objectfifo.link [@link1, @link2, @link3, @link4] -> [@link5] ([] []) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir deleted file mode 100644 index 49bc2bbad..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir +++ /dev/null @@ -1,188 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @outC : memref<16x16xi16> -// CHECK: memref.global "public" @inB : memref<8x16xi16> -// CHECK: memref.global "public" @inA : memref<16x8xi16> -// CHECK-DAG: %[[TILE_0_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_0_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[OUTC_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "outC_cons_prod_lock_0"} -// CHECK-DAG: %[[OUTC_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "outC_cons_cons_lock_0"} -// CHECK-DAG: %[[OUTC_BUFF_0:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "outC_prod_buff_0_0"} : memref<16x16xi16> -// CHECK-DAG: %[[OUTC_BUFF_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "outC_prod_buff_0_1"} : memref<16x16xi16> -// CHECK-DAG: %[[OUTC_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 2 : i8, sym_name = "outC_prod_prod_lock_0"} -// CHECK-DAG: %[[OUTC_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "outC_prod_cons_lock_0"} -// CHECK-DAG: %[[INB_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inB_cons_buff_0_0"} : memref<8x16xi16> 
-// CHECK-DAG: %[[INB_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inB_cons_buff_0_1"} : memref<8x16xi16> -// CHECK-DAG: %[[INB_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 2 : i8, sym_name = "inB_cons_prod_lock_0"} -// CHECK-DAG: %[[INB_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "inB_cons_cons_lock_0"} -// CHECK-DAG: %[[INB_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inB_prod_prod_lock_0"} -// CHECK-DAG: %[[INB_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inB_prod_cons_lock_0"} -// CHECK-DAG: %[[INA_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inA_cons_buff_0_0"} : memref<16x8xi16> -// CHECK-DAG: %[[INA_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inA_cons_buff_0_1"} : memref<16x8xi16> -// CHECK-DAG: %[[INA_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 2 : i8, sym_name = "inA_cons_prod_lock_0"} -// CHECK-DAG: %[[INA_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "inA_cons_cons_lock_0"} -// CHECK-DAG: %[[INA_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inA_prod_prod_lock_0"} -// CHECK-DAG: %[[INA_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inA_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_0_0]], DMA : 0, %[[TILE_0_2]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_0_0]], DMA : 1, %[[TILE_0_2]], DMA : 1) -// CHECK-DAG: aie.flow(%[[TILE_0_2]], DMA : 0, %[[TILE_0_0]], DMA : 0) -// CHECK: func.func @zero_scalar_i16(%[[ARG0:.*]]: memref<16x16xi16>) { -// CHECK: return -// CHECK: } -// CHECK: func.func @matmul_scalar_i16_i16(%[[ARG0:.*]]: memref<16x8xi16>, %[[ARG1:.*]]: memref<8x16xi16>, %[[ARG2:.*]]: memref<16x16xi16>) { -// CHECK: return -// CHECK: } -// CHECK: aie.shim_dma_allocation @inA(MM2S, 0, 2) -// CHECK: %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index 
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C4294967295:.*]] = arith.constant 4294967295 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C4294967295]] step %[[C1]] { -// CHECK: scf.for %[[ARG1:.*]] = %[[C0]] to %[[C4]] step %[[C2]] { -// CHECK: aie.use_lock(%[[OUTC_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @zero_scalar_i16(%[[OUTC_BUFF_0]]) : (memref<16x16xi16>) -> () -// CHECK: scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C2]] { -// CHECK: aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_0]], %[[INB_CONS_BUFF_0]], %[[OUTC_BUFF_0]]) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> () -// CHECK: aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_1]], %[[INB_CONS_BUFF_1]], %[[OUTC_BUFF_0]]) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> () -// CHECK: aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.use_lock(%[[OUTC_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[OUTC_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @zero_scalar_i16(%[[OUTC_BUFF_1]]) : (memref<16x16xi16>) -> () -// CHECK: scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C2]] { -// CHECK: aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_0]], %[[INB_CONS_BUFF_0]], %[[OUTC_BUFF_1]]) : (memref<16x8xi16>, 
memref<8x16xi16>, memref<16x16xi16>) -> () -// CHECK: aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_1]], %[[INB_CONS_BUFF_1]], %[[OUTC_BUFF_1]]) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> () -// CHECK: aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.use_lock(%[[OUTC_CONS_LOCK]], Release, 1) -// CHECK: } -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: aie.shim_dma_allocation @inB(MM2S, 1, 2) -// CHECK: aie.shim_dma_allocation @outC(S2MM, 0, 2) -// CHECK: %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[INA_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[INA_CONS_BUFF_0]] : memref<16x8xi16>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[INA_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[INA_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[INA_CONS_BUFF_1]] : memref<16x8xi16>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[INA_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[INB_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[INB_CONS_BUFF_0]] : memref<8x16xi16>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[INB_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[INB_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[INB_CONS_BUFF_1]] : memref<8x16xi16>) {len = 128 : i32} -// CHECK: 
aie.use_lock(%[[INB_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: %[[VAL_2:.*]] = aie.dma_start(MM2S, 0, ^bb7, ^bb9) -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[OUTC_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OUTC_BUFF_0]] : memref<16x16xi16>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OUTC_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb8 -// CHECK: ^bb8: -// CHECK: aie.use_lock(%[[OUTC_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OUTC_BUFF_1]] : memref<16x16xi16>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OUTC_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb9: -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @matmul { - aie.device(xcve2302) { - %t00 = aie.tile(2, 0) - %t02 = aie.tile(2, 2) - aie.flow(%t00, DMA : 0, %t02, DMA : 0) {symbol = @inA} - aie.flow(%t00, DMA : 1, %t02, DMA : 1) {symbol = @inB} - aie.flow(%t02, DMA : 0, %t00, DMA : 0) {symbol = @outC} - aie.objectfifo @inA (%t00, { %t02 }, 2 : i32) : !aie.objectfifo> - aie.objectfifo @inB (%t00, { %t02 }, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC (%t02, { %t00 }, 2 : i32) : !aie.objectfifo> - func.func @zero_scalar_i16(%elem0 : memref<16x16xi16>) -> () { return } - func.func @matmul_scalar_i16_i16(%elem0 : memref<16x8xi16>, %elem1 : memref<8x16xi16>, %elem2 : memref<16x16xi16>) -> () { return } - aie.core(%t02) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %intmax = arith.constant 0xFFFFFFFF : index - scf.for %reps = %c0 to %intmax step %c1 { - scf.for %arg2 = %c0 to %c4 step %c2 { - %subview4 = aie.objectfifo.acquire @outC (Produce, 1) : !aie.objectfifosubview> - %elem4 = aie.objectfifo.subview.access %subview4[0] : !aie.objectfifosubview> -> memref<16x16xi16> - func.call @zero_scalar_i16(%elem4) : (memref<16x16xi16>) -> () - scf.for %arg3 = %c0 to %c4 step %c2 { - %subview0 = aie.objectfifo.acquire 
@inA (Consume, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<16x8xi16> - %subview1 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<8x16xi16> - func.call @matmul_scalar_i16_i16(%elem0, %elem1, %elem4) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> () - aie.objectfifo.release @inA (Consume, 1) - aie.objectfifo.release @inB (Consume, 1) - %subview2 = aie.objectfifo.acquire @inA (Consume, 1) : !aie.objectfifosubview> - %elem2 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16x8xi16> - %subview3 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview> - %elem3 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref<8x16xi16> - func.call @matmul_scalar_i16_i16(%elem2, %elem3, %elem4) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> () - aie.objectfifo.release @inA (Consume, 1) - aie.objectfifo.release @inB (Consume, 1) - } - aie.objectfifo.release @outC (Produce, 1) - %subview5 = aie.objectfifo.acquire @outC (Produce, 1) : !aie.objectfifosubview> - %elem5 = aie.objectfifo.subview.access %subview5[0] : !aie.objectfifosubview> -> memref<16x16xi16> - func.call @zero_scalar_i16(%elem5) : (memref<16x16xi16>) -> () - scf.for %arg3 = %c0 to %c4 step %c2 { - %subview0 = aie.objectfifo.acquire @inA (Consume, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<16x8xi16> - %subview1 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<8x16xi16> - func.call @matmul_scalar_i16_i16(%elem0, %elem1, %elem5) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> () - aie.objectfifo.release @inA (Consume, 1) - aie.objectfifo.release @inB 
(Consume, 1) - %subview2 = aie.objectfifo.acquire @inA (Consume, 1) : !aie.objectfifosubview> - %elem2 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16x8xi16> - %subview3 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview> - %elem3 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref<8x16xi16> - func.call @matmul_scalar_i16_i16(%elem2, %elem3, %elem5) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> () - aie.objectfifo.release @inA (Consume, 1) - aie.objectfifo.release @inB (Consume, 1) - } - aie.objectfifo.release @outC (Produce, 1) - } - } - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir deleted file mode 100644 index c13c13319..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir +++ /dev/null @@ -1,55 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of : memref<16xi32> -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[OF_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_cons_prod_lock_0"} -// CHECK-DAG: %[[OF_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_cons_cons_lock_0"} -// CHECK-DAG: %[[OF_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = 
"of_prod_prod_lock_0"} -// CHECK-DAG: %[[OF_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "of_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0) -// CHECK: %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @memTile { - aie.device(xcve2302) { - %tile11 = aie.tile(2, 1) - %tile12 = aie.tile(2, 2) - aie.flow(%tile11, DMA : 0, %tile12, DMA : 0) {symbol = @of} - aie.objectfifo @of (%tile11, {%tile12}, 2 : i32) : !aie.objectfifo> - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir deleted file mode 100644 index f8b66fb1e..000000000 --- 
a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir +++ /dev/null @@ -1,126 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of1 : memref<256xi32> -// CHECK: memref.global "public" @of0 : memref<256xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_1_3:.*]] = aie.tile(1, 3) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"} -// CHECK-DAG: %[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"} -// CHECK-DAG: %[[OF1_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF1_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF1_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"} -// CHECK-DAG: %[[OF1_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"} -// CHECK-DAG: %[[OF0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF0_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<256xi32> -// CHECK-DAG: %[[OF0_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<256xi32> -// CHECK-DAG: %[[OF0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_0"} -// 
CHECK-DAG: %[[OF0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"} -// CHECK-DAG: %[[OF0_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF0_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF0_BUFF_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<256xi32> -// CHECK-DAG: %[[OF0_BUFF_3:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<256xi32> -// CHECK-DAG: %[[OF0_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"} -// CHECK-DAG: %[[OF0_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_3]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_3_3]], DMA : 0) -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_0]] : memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_1]] : memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_2]] : memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_3]] : 
memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_BUFF_0]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_BUFF_1]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_2]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_3]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], 
Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @ndDMAObjFifoAIE2 { - aie.device(xcve2302) { - %tile12 = aie.tile(1, 2) - %tile13 = aie.tile(1, 3) - %tile33 = aie.tile(3, 3) - // Even if an objectFifo could be implemented in shared memory, as with - // this case between two adjacent tiles, we need to use DMAs if a data - // layout transformation with toStream and fromStream was specified. 
- aie.flow(%tile12, DMA : 0, %tile13, DMA : 0) {symbol = @of0} - aie.flow(%tile12, DMA : 1, %tile33, DMA : 0) {symbol = @of1} - aie.objectfifo @of0 (%tile12 toStream [, , ], // transpose - {%tile13 fromStream []}, - 4 : i32) : !aie.objectfifo> - aie.objectfifo @of1 (%tile12 toStream [], {%tile33}, - 2 : i32) : !aie.objectfifo> - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir deleted file mode 100644 index 87d830ca9..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir +++ /dev/null @@ -1,123 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of2 : memref<128xi32> -// CHECK: memref.global "public" @of1 : memref<128xi32> -// CHECK: memref.global "public" @of0 : memref<256xi32> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_2_1:.*]] = aie.tile(2, 1) -// CHECK-DAG: %[[TILE_3_2:.*]] = aie.tile(3, 2) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[OF2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of2_cons_buff_1_0"} : memref<128xi32> -// CHECK-DAG: %[[OF2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of2_cons_buff_1_1"} : memref<128xi32> -// CHECK-DAG: %[[OF2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of2_cons_prod_lock_1"} -// CHECK-DAG: %[[OF2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of2_cons_cons_lock_1"} -// CHECK-DAG: %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "of1_cons_buff_0_0"} : memref<128xi32> -// CHECK-DAG: %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "of1_cons_buff_0_1"} : memref<128xi32> -// CHECK-DAG: %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_2]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"} -// CHECK-DAG: 
%[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"} -// CHECK-DAG: %[[OF0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of0_link_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of0_link_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 4 : i8, sym_name = "of0_link_prod_lock_0"} -// CHECK-DAG: %[[OF0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "of0_link_cons_lock_0"} -// CHECK-DAG: %[[OF0_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of0_prod_prod_lock_0"} -// CHECK-DAG: %[[OF0_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_3_2]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_1]], DMA : 1, %[[TILE_3_3]], DMA : 0) -// CHECK-DAG aie.shim_dma_allocation @of0(MM2S, 0, 2) -// CHECK: %[[MEMTILE_DMA_1_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 2) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 2) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 2) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 2) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie, , , ]>, len = 128 : 
i32} -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie, , , ]>, len = 128 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: %[[VAL_2:.*]] = aie.dma_start(MM2S, 1, ^bb7, ^bb9) -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie, , , ]>, len = 128 : i32, offset = 128 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb8 -// CHECK: ^bb8: -// CHECK: aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie, , , ]>, len = 128 : i32, offset = 128 : i32} -// CHECK: aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb9: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_3_2]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<128xi32>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<128xi32>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF2_CONS_PROD_LOCK]], 
AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF2_CONS_BUFF_0]] : memref<128xi32>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF2_CONS_BUFF_1]] : memref<128xi32>) {len = 128 : i32} -// CHECK: aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @ndDMAObjFifoAIE2 { - aie.device(xcve2302) { - %tile10 = aie.tile(2, 0) - %tile11 = aie.tile(2, 1) - %tile22 = aie.tile(3, 2) - %tile23 = aie.tile(3, 3) - aie.flow(%tile10, DMA : 0, %tile11, DMA : 0) {symbol = @of0} - aie.flow(%tile11, DMA : 0, %tile22, DMA : 0) {symbol = @of1} - aie.flow(%tile11, DMA : 1, %tile23, DMA : 0) {symbol = @of2} - aie.objectfifo @of0 (%tile10, {%tile11}, - 2 : i32) : !aie.objectfifo> - aie.objectfifo @of1 (%tile11 toStream [, - , - , - ], - {%tile22}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @of2 (%tile11 toStream [, - , - , - ], - {%tile23}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [ @of0 ] -> [ @of1, @of2 ] ([] []) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir deleted file mode 100644 index e3cbd0ca8..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir +++ /dev/null @@ -1,201 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of3 : memref<256xi32> -// CHECK: memref.global "public" @of1 : memref<256xi32> -// CHECK: memref.global "public" @of0 : memref<256xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_1_3:.*]] = aie.tile(1, 3) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[TILE_2_2:.*]] 
= aie.tile(2, 2) -// CHECK-DAG: %[[TILE_2_3:.*]] = aie.tile(2, 3) -// CHECK-DAG: %[[OF3_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of3_cons_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF3_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of3_cons_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF3_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "of3_cons_prod_lock_0"} -// CHECK-DAG: %[[OF3_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "of3_cons_cons_lock_0"} -// CHECK-DAG: %[[OF3_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of3_prod_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF3_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of3_prod_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF3_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of3_prod_prod_lock_0"} -// CHECK-DAG: %[[OF3_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of3_prod_cons_lock_0"} -// CHECK-DAG: %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"} -// CHECK-DAG: %[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"} -// CHECK-DAG: %[[OF1_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF1_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF1_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"} -// CHECK-DAG: %[[OF1_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"} -// CHECK-DAG: %[[OF0_0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_1_3]]) 
{sym_name = "of0_cons_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF0_0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF0_0_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<256xi32> -// CHECK-DAG: %[[OF0_0_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<256xi32> -// CHECK-DAG: %[[OF0_0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_0"} -// CHECK-DAG: %[[OF0_0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"} -// CHECK-DAG: %[[OF0_1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_0"} : memref<256xi32> -// CHECK-DAG: %[[OF0_1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_1"} : memref<256xi32> -// CHECK-DAG: %[[OF0_1_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_2"} : memref<256xi32> -// CHECK-DAG: %[[OF0_1_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_3"} : memref<256xi32> -// CHECK-DAG: %[[OF0_1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_1"} -// CHECK-DAG: %[[OF0_1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_1"} -// CHECK-DAG: %[[OF0_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<256xi32> -// CHECK-DAG: %[[OF0_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<256xi32> -// CHECK-DAG: %[[OF0_BUFF_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<256xi32> -// CHECK-DAG: %[[OF0_BUFF_3:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<256xi32> -// CHECK-DAG: %[[OF0_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"} -// CHECK-DAG: %[[OF0_CONS_LOCK:.*]] = 
aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_3]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_3_3]], DMA : 1) -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_3]], DMA : 0) -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_0]] : memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_1]] : memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_2]] : memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_BUFF_3]] : memref<256xi32>) {dimensions = #aie, , ]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_BUFF_0]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: 
aie.dma_bd(%[[OF1_BUFF_1]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_0_CONS_BUFF_2]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_0_CONS_BUFF_3]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_1_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], 
AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_1_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_1_CONS_BUFF_2]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF0_1_CONS_BUFF_3]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_4:.*]] = aie.dma_start(S2MM, 1, ^bb6, ^bb8) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_5:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF3_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF3_BUFF_0]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF3_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF3_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF3_BUFF_1]] : memref<256xi32>) {len = 256 : i32} -// CHECK: aie.use_lock(%[[OF3_PROD_LOCK]], 
Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) { -// CHECK: %[[VAL_6:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF3_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF3_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF3_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie]>, len = 256 : i32} -// CHECK: aie.use_lock(%[[OF3_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @ndDMAObjFifoAIE2 { - aie.device(xcve2302) { - %tile12 = aie.tile(1, 2) - %tile13 = aie.tile(1, 3) - %tile33 = aie.tile(3, 3) - %tile22 = aie.tile(2, 2) - %tile23 = aie.tile(2, 3) - aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of0} - aie.flow(%tile12, DMA : 0, %tile13, DMA : 0) {symbol = @of0} - aie.flow(%tile12, DMA : 1, %tile33, DMA : 1) {symbol = @of1} - aie.flow(%tile22, DMA : 0, %tile23, DMA : 0) {symbol = @of3} - aie.objectfifo @of0 (%tile12 toStream [, , ], // transpose - {%tile13 fromStream [], - %tile33 fromStream []}, - 4 : i32) : !aie.objectfifo> - aie.objectfifo @of1 (%tile12 toStream [], {%tile33}, - 2 : i32) : !aie.objectfifo> - aie.objectfifo @of3 (%tile22, {%tile23 fromStream []}, - 2 : i32) : !aie.objectfifo> - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir deleted file mode 100644 index 06a3be4da..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir +++ /dev/null @@ -1,365 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: 
aie.device(npu1_4col) { -// CHECK: memref.global "public" @in8 : memref<32x32xi32> -// CHECK: memref.global "public" @in7 : memref<64x32xi32> -// CHECK: memref.global "public" @in2 : memref<32x64xi32> -// CHECK-DAG: %[[TILE_0_1:.*]] = aie.tile(0, 1) -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_0_2:.*]] = aie.tile(0, 2) -// CHECK-DAG: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_0"} : memref<32x32xi32> -// CHECK-DAG: %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_1"} : memref<32x32xi32> -// CHECK-DAG: %[[BUFFER_1_2_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_2"} : memref<32x32xi32> -// CHECK-DAG: %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_3"} : memref<32x32xi32> -// CHECK-DAG: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "in8_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_3:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "in8_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_0"} : memref<32x32xi32> -// CHECK-DAG: %[[BUFFER_0_1_4:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_1"} : memref<32x32xi32> -// CHECK-DAG: %[[BUFFER_0_1_5:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_2"} : memref<32x32xi32> -// CHECK-DAG: %[[BUFFER_0_1_6:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_3"} : memref<32x32xi32> -// CHECK-DAG: %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]]) {init = 4 : i8, sym_name = "in8_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_0_1_7:.*]] = aie.lock(%[[TILE_0_1]]) {init = 0 : i8, sym_name = "in8_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_0_1_8:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_0"} : memref<64x32xi32> -// CHECK-DAG: %[[BUFFER_0_1_9:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_1"} : memref<64x32xi32> -// CHECK-DAG: %[[BUFFER_0_1_10:.*]] = 
aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_2"} : memref<64x32xi32> -// CHECK-DAG: %[[BUFFER_0_1_11:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_3"} : memref<64x32xi32> -// CHECK-DAG: %[[LOCK_0_1_12:.*]] = aie.lock(%[[TILE_0_1]]) {init = 4 : i8, sym_name = "in7_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_0_1_13:.*]] = aie.lock(%[[TILE_0_1]]) {init = 0 : i8, sym_name = "in7_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_2_14:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_0"} : memref<64x32xi32> -// CHECK-DAG: %[[BUFFER_1_2_15:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_1"} : memref<64x32xi32> -// CHECK-DAG: %[[BUFFER_1_2_16:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_2"} : memref<64x32xi32> -// CHECK-DAG: %[[BUFFER_1_2_17:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_3"} : memref<64x32xi32> -// CHECK-DAG: %[[LOCK_1_2_18:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "in7_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_19:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "in7_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_0_1_20:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_0"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_0_1_21:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_1"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_0_1_22:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_2"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_0_1_23:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_3"} : memref<32x64xi32> -// CHECK-DAG: %[[LOCK_0_1_24:.*]] = aie.lock(%[[TILE_0_1]]) {init = 4 : i8, sym_name = "in2_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_0_1_25:.*]] = aie.lock(%[[TILE_0_1]]) {init = 0 : i8, sym_name = "in2_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "in2_cons_buff_0_0"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_0_2_26:.*]] = aie.buffer(%[[TILE_0_2]]) 
{sym_name = "in2_cons_buff_0_1"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_0_2_27:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "in2_cons_buff_0_2"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_0_2_28:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "in2_cons_buff_0_3"} : memref<32x64xi32> -// CHECK-DAG: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]]) {init = 4 : i8, sym_name = "in2_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_0_2_29:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "in2_cons_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_2_30:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_0"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_1_2_31:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_1"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_1_2_32:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_2"} : memref<32x64xi32> -// CHECK-DAG: %[[BUFFER_1_2_33:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_3"} : memref<32x64xi32> -// CHECK-DAG: %[[LOCK_1_2_34:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "in2_cons_prod_lock_1"} -// CHECK-DAG: %[[LOCK_1_2_35:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "in2_cons_cons_lock_1"} -// CHECK-DAG: aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_1_2]], DMA : 0) {symbol = @in2} -// CHECK-DAG: aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0) {symbol = @in2} -// CHECK-DAG: aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_1_2]], DMA : 1) {symbol = @in7} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_0_1]], DMA : 0) {symbol = @in8} -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK: %[[C8:.*]] = arith.constant 8 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C64:.*]] = arith.constant 64 : index -// CHECK: %[[C128:.*]] = arith.constant 128 : index -// CHECK: %[[C960:.*]] = arith.constant 960 : index -// CHECK: 
aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_1_2]] to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x32xi32> to memref<4x8x4x8xi32> -// CHECK: aie.use_lock(%[[LOCK_1_2_34]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2_18]], Release, 1) -// CHECK: scf.for %[[ARG0:.*]] = %[[C64]] to %[[C960]] step %[[C128]] { -// CHECK: aie.use_lock(%[[LOCK_1_2_35]], AcquireGreaterEqual, 1) -// CHECK: %[[REINTERPRET_CAST_36:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_30]] to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32> -// CHECK: aie.use_lock(%[[LOCK_1_2_19]], AcquireGreaterEqual, 1) -// CHECK: %[[REINTERPRET_CAST_37:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_14]] to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32> -// CHECK: scf.for %[[ARG1:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK: scf.for %[[ARG5:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: %[[VAL_0:.*]] = memref.load %[[REINTERPRET_CAST_36]]{{\[}}%[[ARG3]], %[[ARG1]], %[[ARG4]], %[[ARG6]]] : memref<8x8x4x8xi32> -// CHECK: %[[VAL_1:.*]] = memref.load %[[REINTERPRET_CAST_37]]{{\[}}%[[ARG2]], %[[ARG3]], %[[ARG6]], %[[ARG5]]] : memref<4x8x8x8xi32> -// CHECK: %[[VAL_2:.*]] = memref.load %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32> -// CHECK: %[[VAL_3:.*]] = arith.muli %[[VAL_0]], %[[VAL_1]] : i32 -// CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_2]], %[[VAL_3]] : i32 -// CHECK: memref.store %[[VAL_4]], %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32> 
-// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: aie.use_lock(%[[LOCK_1_2_34]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2_18]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2_35]], AcquireGreaterEqual, 1) -// CHECK: %[[REINTERPRET_CAST_38:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_31]] to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32> -// CHECK: aie.use_lock(%[[LOCK_1_2_19]], AcquireGreaterEqual, 1) -// CHECK: %[[REINTERPRET_CAST_39:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_15]] to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32> -// CHECK: scf.for %[[ARG1:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK: scf.for %[[ARG5:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C8]] step %[[C1]] { -// CHECK: %[[VAL_5:.*]] = memref.load %[[REINTERPRET_CAST_38]]{{\[}}%[[ARG3]], %[[ARG1]], %[[ARG4]], %[[ARG6]]] : memref<8x8x4x8xi32> -// CHECK: %[[VAL_6:.*]] = memref.load %[[REINTERPRET_CAST_39]]{{\[}}%[[ARG2]], %[[ARG3]], %[[ARG6]], %[[ARG5]]] : memref<4x8x8x8xi32> -// CHECK: %[[VAL_7:.*]] = memref.load %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32> -// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_5]], %[[VAL_6]] : i32 -// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_7]], %[[VAL_8]] : i32 -// CHECK: memref.store %[[VAL_9]], %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32> -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: aie.use_lock(%[[LOCK_1_2_34]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2_18]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } 
-// CHECK: %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) { -// CHECK: %[[VAL_10:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_20]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_24]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_21]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_24]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_22]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_24]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_23]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_24]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_11:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb10) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[LOCK_0_1_13]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_8]] : memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_12]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[LOCK_0_1_13]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_9]] : memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_12]], Release, 1) -// CHECK: aie.next_bd ^bb8 -// CHECK: ^bb8: -// CHECK: aie.use_lock(%[[LOCK_0_1_13]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_10]] : memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_12]], Release, 1) -// CHECK: aie.next_bd ^bb9 -// CHECK: ^bb9: -// CHECK: aie.use_lock(%[[LOCK_0_1_13]], 
AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_11]] : memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_12]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb10: -// CHECK: %[[VAL_12:.*]] = aie.dma_start(S2MM, 0, ^bb11, ^bb15) -// CHECK: ^bb11: -// CHECK: aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_7]], Release, 1) -// CHECK: aie.next_bd ^bb12 -// CHECK: ^bb12: -// CHECK: aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_4]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_7]], Release, 1) -// CHECK: aie.next_bd ^bb13 -// CHECK: ^bb13: -// CHECK: aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_5]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_7]], Release, 1) -// CHECK: aie.next_bd ^bb14 -// CHECK: ^bb14: -// CHECK: aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_1_6]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_1_7]], Release, 1) -// CHECK: aie.next_bd ^bb11 -// CHECK: ^bb15: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { -// CHECK: %[[VAL_13:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_2]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_2_29]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_2_26]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_2_29]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) -// CHECK: 
aie.dma_bd(%[[BUFFER_0_2_27]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_2_29]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_0_2_28]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_0_2_29]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_14:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_30]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_35]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_31]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_35]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_32]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_35]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_33]] : memref<32x64xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_35]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: %[[VAL_15:.*]] = aie.dma_start(S2MM, 1, ^bb6, ^bb10) -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_14]] : memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_19]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_15]] : 
memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_19]], Release, 1) -// CHECK: aie.next_bd ^bb8 -// CHECK: ^bb8: -// CHECK: aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_16]] : memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_19]], Release, 1) -// CHECK: aie.next_bd ^bb9 -// CHECK: ^bb9: -// CHECK: aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_17]] : memref<64x32xi32>) {len = 2048 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_19]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb10: -// CHECK: %[[VAL_16:.*]] = aie.dma_start(MM2S, 0, ^bb11, ^bb15) -// CHECK: ^bb11: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb12 -// CHECK: ^bb12: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb13 -// CHECK: ^bb13: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_1]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb14 -// CHECK: ^bb14: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_2]] : memref<32x32xi32>) {len = 1024 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb11 -// CHECK: ^bb15: -// CHECK: aie.end -// CHECK: } -// CHECK: } -aie.device(npu1_4col) { - %tile_0_1 = aie.tile(0, 1) - %tile_1_2 = aie.tile(1, 2) - %tile_0_2 = aie.tile(0, 2) - aie.flow(%tile_0_1, DMA : 0, %tile_1_2, DMA : 0) {symbol = @in2} - aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) {symbol = @in2} - aie.flow(%tile_0_1, DMA 
: 1, %tile_1_2, DMA : 1) {symbol = @in7} - aie.flow(%tile_1_2, DMA : 0, %tile_0_1, DMA : 0) {symbol = @in8} - aie.objectfifo @in2(%tile_0_1, {%tile_0_2, %tile_1_2}, 4 : i32) : !aie.objectfifo> - aie.objectfifo @in7(%tile_0_1, {%tile_1_2}, 4 : i32) : !aie.objectfifo> - aie.objectfifo @in8(%tile_1_2, {%tile_0_1}, 4 : i32) : !aie.objectfifo> - %core_1_2 = aie.core(%tile_1_2) { - %c8 = arith.constant 8 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c960 = arith.constant 960 : index - %0 = aie.objectfifo.acquire @in8(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<32x32xi32> - %reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x32xi32> to memref<4x8x4x8xi32> - aie.objectfifo.release @in2(Consume, 1) - aie.objectfifo.release @in7(Consume, 1) - scf.for %arg0 = %c64 to %c960 step %c128 { - %10 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x64xi32> - %reinterpret_cast_4 = memref.reinterpret_cast %11 to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32> - %12 = aie.objectfifo.acquire @in7(Consume, 1) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<64x32xi32> - %reinterpret_cast_5 = memref.reinterpret_cast %13 to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32> - scf.for %arg1 = %c0 to %c8 step %c1 { - scf.for %arg2 = %c0 to %c4 step %c1 { - scf.for %arg3 = %c0 to %c8 step %c1 { - scf.for %arg4 = %c0 to %c4 step %c1 { - scf.for %arg5 = %c0 to %c8 step %c1 { - scf.for %arg6 = %c0 to %c8 step %c1 { - %14 = memref.load %reinterpret_cast_4[%arg3, 
%arg1, %arg4, %arg6] : memref<8x8x4x8xi32> - %15 = memref.load %reinterpret_cast_5[%arg2, %arg3, %arg6, %arg5] : memref<4x8x8x8xi32> - %16 = memref.load %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32> - %17 = arith.muli %14, %15 : i32 - %18 = arith.addi %16, %17 : i32 - memref.store %18, %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32> - } - } - } - } - } - } - aie.objectfifo.release @in2(Consume, 1) - aie.objectfifo.release @in7(Consume, 1) - %19 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview> - %20 = aie.objectfifo.subview.access %19[0] : !aie.objectfifosubview> -> memref<32x64xi32> - %reinterpret_cast_6 = memref.reinterpret_cast %20 to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32> - %21 = aie.objectfifo.acquire @in7(Consume, 1) : !aie.objectfifosubview> - %22 = aie.objectfifo.subview.access %21[0] : !aie.objectfifosubview> -> memref<64x32xi32> - %reinterpret_cast_7 = memref.reinterpret_cast %22 to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32> - scf.for %arg1 = %c0 to %c8 step %c1 { - scf.for %arg2 = %c0 to %c4 step %c1 { - scf.for %arg3 = %c0 to %c8 step %c1 { - scf.for %arg4 = %c0 to %c4 step %c1 { - scf.for %arg5 = %c0 to %c8 step %c1 { - scf.for %arg6 = %c0 to %c8 step %c1 { - %23 = memref.load %reinterpret_cast_6[%arg3, %arg1, %arg4, %arg6] : memref<8x8x4x8xi32> - %24 = memref.load %reinterpret_cast_7[%arg2, %arg3, %arg6, %arg5] : memref<4x8x8x8xi32> - %25 = memref.load %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32> - %26 = arith.muli %23, %24 : i32 - %27 = arith.addi %25, %26 : i32 - memref.store %27, %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32> - } - } - } - } - } - } - aie.objectfifo.release @in2(Consume, 1) - aie.objectfifo.release @in7(Consume, 1) - } - aie.end - } -} diff --git 
a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir deleted file mode 100644 index 5e5349a8b..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir +++ /dev/null @@ -1,125 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @objfifo : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[OBJFIFO_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OBJFIFO_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OBJFIFO_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "objfifo_cons_prod_lock_0"} -// CHECK-DAG: %[[OBJFIFO_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"} -// CHECK-DAG: %[[OBJFIFO_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OBJFIFO_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OBJFIFO_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "objfifo_prod_prod_lock_0"} -// CHECK-DAG: %[[OBJFIFO_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index 
-// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_BUFF_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_BUFF_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_CONS_BUFF_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OBJFIFO_CONS_BUFF_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// 
CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OBJFIFO_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @non_adjacency { - aie.device(npu1_4col) { - %tile12 = aie.tile(1, 2) - %tile33 = aie.tile(3, 3) - aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @objfifo} - aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo> - func.func @some_work(%lineOut : memref<16xi32>) -> () { - return - } - %core12 = aie.core(%tile12) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - } - aie.end - } - %core33 = aie.core(%tile33) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = 
arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @objfifo (Consume, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Consume, 1) - %subview1 = aie.objectfifo.acquire @objfifo (Consume, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Consume, 1) - } - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir deleted file mode 100644 index 480e9cfea..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir +++ /dev/null @@ -1,139 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @objfifo : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "objfifo_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32> -// 
CHECK-DAG: %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "objfifo_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @objfifo} -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C4]] { -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_3]], Release, 2) -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_3]], Release, 2) -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_3]], Release, 2) -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], 
AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_3]], Release, 2) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_3_3]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @non_adjacency { - aie.device(npu1_4col) { - %tile12 = aie.tile(1, 2) - %tile33 = aie.tile(3, 3) - aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @objfifo} - aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo> - func.func @some_work(%lineOut : memref<16xi32>) -> () { - return - } - %core12 = aie.core(%tile12) { - %c0 = arith.constant 0 : index - %c2 = arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = 
aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Produce, 1) - } - aie.end - } - %core33 = aie.core(%tile33) { - %c0 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c4 { - %subview = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Consume, 2) - %subview1 = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview> - %elem3 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem4 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem3) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Consume, 2) - %subview2 = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview> - %elem6 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem7 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem6) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Consume, 2) - %subview3 = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview> - 
%elem9 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem10 = aie.objectfifo.subview.access %subview3[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem9) : (memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Consume, 2) - } - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir deleted file mode 100644 index 7ae508421..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir +++ /dev/null @@ -1,122 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[OF_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of_cons_prod_lock_0"} -// CHECK-DAG: %[[OF_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of_cons_cons_lock_0"} -// CHECK-DAG: %[[OF_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of_prod_prod_lock_0"} -// CHECK-DAG: %[[OF_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: 
%[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[OF_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OF_BUFF_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OF_CONS_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[OF_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OF_BUFF_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OF_CONS_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index -// CHECK: scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] { -// CHECK: aie.use_lock(%[[OF_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OF_CONS_BUFF_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OF_CONS_PROD_LOCK]], Release, 1) -// CHECK: aie.use_lock(%[[OF_CONS_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[OF_CONS_BUFF_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[OF_CONS_PROD_LOCK]], Release, 1) -// CHECK: } -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: 
aie.use_lock(%[[OF_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } - -module @non_adjacency_AIE2 { - aie.device(xcve2302) { - %tile12 = aie.tile(1, 2) - %tile33 = aie.tile(3, 3) - aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of} - aie.objectfifo @of (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo> - func.func @some_work(%lineOut : memref<16xi32>) -> () { - return - } - %core12 = aie.core(%tile12) { - %c0 = arith.constant 0 : index - %c2 = arith.constant 2 : index - %height = arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @of (Produce, 1) - %subview1 = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @of (Produce, 1) - } - aie.end - } - %core33 = aie.core(%tile33) { - %c0 = arith.constant 0 : index - %c2 = arith.constant 2 : index - %height = 
arith.constant 12 : index - scf.for %indexInHeight = %c0 to %height step %c2 { - %subview = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0) : (memref<16xi32>) -> () - aie.objectfifo.release @of (Consume, 1) - %subview1 = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview> - %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem1) : (memref<16xi32>) -> () - aie.objectfifo.release @of (Consume, 1) - } - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir deleted file mode 100644 index 05a734695..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir +++ /dev/null @@ -1,75 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @ext_of : memref<16xi32> -// CHECK-DAG: %[[TILE_3_2:.*]] = aie.tile(3, 2) -// CHECK-DAG: %[[TILE_3_0:.*]] = aie.tile(3, 0) -// CHECK-DAG: %[[LOCK_3_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "ext_of_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_0_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "ext_of_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_3_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "ext_of_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_2_1:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "ext_of_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_2_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "ext_of_cons_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_2:.*]] = aie.lock(%[[TILE_3_2]]) {init = 3 : i8, sym_name = "ext_of_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_2_3:.*]] = 
aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "ext_of_cons_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_3_0]], DMA : 0, %[[TILE_3_2]], DMA : 0) {symbol = @ext_of} -// CHECK: %[[VAL_0:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32> -// CHECK: aie.objectfifo.register_external_buffers @ext_of(%[[TILE_3_0]], {%[[VAL_0]]}) : (memref<64xi32>) -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>, %[[ARG1:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: aie.shim_dma_allocation @ext_of(MM2S, 0, 3) -// CHECK: %[[CORE_3_2:.*]] = aie.core(%[[TILE_3_2]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], AcquireGreaterEqual, 3) -// CHECK: func.call @some_work(%[[BUFFER_3_2]], %[[BUFFER_3_2_1]]) : (memref<16xi32>, memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_2]], Release, 3) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_2:.*]] = aie.mem(%[[TILE_3_2]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb4) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb4: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @register_external_buffers { - aie.device(npu1_4col) { - %tile32 = aie.tile(3, 2) - %tile30 = 
aie.tile(3, 0) - aie.flow(%tile30, DMA : 0, %tile32, DMA : 0) {symbol = @ext_of} - aie.objectfifo @ext_of (%tile30, {%tile32}, 3 : i32) : !aie.objectfifo> - %ext_buffer_in = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32> - aie.objectfifo.register_external_buffers @ext_of (%tile30, {%ext_buffer_in}) : (memref<64xi32>) - func.func @some_work(%a : memref<16xi32>, %b : memref<16xi32>) -> () { - return - } - %core71 = aie.core(%tile32) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %height = arith.constant 12 : index - %subview = aie.objectfifo.acquire @ext_of (Consume, 3) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0, %elem1) : (memref<16xi32>, memref<16xi32>) -> () - aie.objectfifo.release @ext_of (Consume, 3) - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir deleted file mode 100644 index 3bec7a260..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir +++ /dev/null @@ -1,103 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 3 : i8, sym_name = "of_prod_prod_lock_0"} -// 
CHECK-DAG: %[[LOCK_1_2_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_2_3:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_5:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_cons_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2_6:.*]] = aie.lock(%[[TILE_1_2]]) {init = 3 : i8, sym_name = "of_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_7:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of_cons_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_2]], DMA : 0) {symbol = @of} -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2_2]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2_7]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2_4]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2_6]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2_2]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2_7]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2_3]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2_6]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb4) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: 
aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_2_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb4: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb5, ^bb8) -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[LOCK_1_2_6]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_7]], Release, 1) -// CHECK: aie.next_bd ^bb6 -// CHECK: ^bb6: -// CHECK: aie.use_lock(%[[LOCK_1_2_6]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_4]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_7]], Release, 1) -// CHECK: aie.next_bd ^bb7 -// CHECK: ^bb7: -// CHECK: aie.use_lock(%[[LOCK_1_2_6]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_5]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2_7]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb8: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @same_core { - aie.device(xcve2302) { - %tile12 = aie.tile(1, 2) - aie.flow(%tile12, DMA : 0, %tile12, DMA : 0) {symbol = @of} - aie.objectfifo @of (%tile12, {%tile12}, 3 : i32) : !aie.objectfifo> - func.func @some_work(%line_in:memref<16xi32>) -> () { - return - } - %core12 = aie.core(%tile12) { - // this acquires 2 elements - %subview0 = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview> - %elem00 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem00) : (memref<16xi32>) -> () - aie.objectfifo.release @of 
(Produce, 1) - %subview1 = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview> - %elem10 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem10) : (memref<16xi32>) -> () - aie.objectfifo.release @of (Consume, 1) - %subview2 = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview> - %elem20 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem20) : (memref<16xi32>) -> () - aie.objectfifo.release @of (Produce, 1) - %subview3 = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview> - %elem30 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem30) : (memref<16xi32>) -> () - aie.objectfifo.release @of (Consume, 1) - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir deleted file mode 100644 index 3d636db89..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir +++ /dev/null @@ -1,75 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @objfifo : memref<16xi32> -// CHECK-DAG: %[[TILE_3_2:.*]] = aie.tile(3, 2) -// CHECK-DAG: %[[TILE_3_0:.*]] = aie.tile(3, 0) -// CHECK-DAG: %[[LOCK_3_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "objfifo_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_0_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_3_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_2_1:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_3_2_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = 
"objfifo_cons_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_3_2:.*]] = aie.lock(%[[TILE_3_2]]) {init = 3 : i8, sym_name = "objfifo_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_3_2_3:.*]] = aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_3_0]], DMA : 0, %[[TILE_3_2]], DMA : 0) {symbol = @objfifo} -// CHECK-DAG: %[[VAL_0:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32> -// CHECK-DAG: aie.objectfifo.register_external_buffers @objfifo(%[[TILE_3_0]], {%[[VAL_0]]}) : (memref<64xi32>) -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>, %[[ARG1:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: aie.shim_dma_allocation @objfifo(MM2S, 0, 3) -// CHECK: %[[CORE_3_2:.*]] = aie.core(%[[TILE_3_2]]) { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_3_2]], %[[BUFFER_3_2_1]]) : (memref<16xi32>, memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_3_2]], Release, 1) -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_3_2:.*]] = aie.mem(%[[TILE_3_2]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb4) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_3_2_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_3_2_3]], 
Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb4: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @shimRow_mem { - aie.device(npu1_4col) { - %tile32 = aie.tile(3, 2) - %tile30 = aie.tile(3, 0) - aie.flow(%tile30, DMA : 0, %tile32, DMA : 0) {symbol = @objfifo} - aie.objectfifo @objfifo (%tile30, {%tile32}, 3 : i32) : !aie.objectfifo> - %ext_buffer_in = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32> - aie.objectfifo.register_external_buffers @objfifo (%tile30, {%ext_buffer_in}) : (memref<64xi32>) - func.func @some_work(%a : memref<16xi32>, %b : memref<16xi32>) -> () { - return - } - %core71 = aie.core(%tile32) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %height = arith.constant 12 : index - %subview = aie.objectfifo.acquire @objfifo (Consume, 1) : !aie.objectfifosubview> - %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem0, %elem1) : (memref<16xi32>, memref<16xi32>) -> () - aie.objectfifo.release @objfifo (Consume, 1) - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir deleted file mode 100644 index 7047e8b69..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir +++ /dev/null @@ -1,68 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of_out : memref<16xi32> -// CHECK: memref.global "public" @of_in : memref<16xi32> -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[OF_OUT_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_out_cons_prod_lock_0"} -// CHECK-DAG: %[[OF_OUT_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) 
{init = 0 : i8, sym_name = "of_out_cons_cons_lock_0"} -// CHECK-DAG: %[[OF_OUT_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_out_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_OUT_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_out_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_OUT_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_out_prod_prod_lock_0"} -// CHECK-DAG: %[[OF_OUT_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_out_prod_cons_lock_0"} -// CHECK-DAG: %[[OF_IN_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_0"} -// CHECK-DAG: %[[OF_IN_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_0"} -// CHECK-DAG: %[[OF_IN_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_prod_lock_0"} -// CHECK-DAG: %[[OF_IN_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_2]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_0]], DMA : 0) -// CHECK-DAG: %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32> -// CHECK-DAG: %[[EXT_BUFFER_OUT:.*]] = aie.external_buffer {sym_name = "ext_buffer_out"} : memref<64xi32> -// CHECK-DAG: aie.shim_dma_allocation @of_in(MM2S, 0, 2) -// CHECK-DAG: aie.shim_dma_allocation @of_out(S2MM, 0, 2) -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_IN_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_CONS_BUFF_0]] : memref<16xi32>) 
{len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_IN_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6) -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[OF_OUT_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_OUT_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_OUT_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb5 -// CHECK: ^bb5: -// CHECK: aie.use_lock(%[[OF_OUT_CONS_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_OUT_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_OUT_PROD_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb6: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @shim_AIE2 { - aie.device(xcve2302) { - %tile22 = aie.tile(2, 2) - %tile20 = aie.tile(2, 0) - aie.flow(%tile20, DMA : 0, %tile22, DMA : 0) {symbol = @of_in} - aie.flow(%tile22, DMA : 0, %tile20, DMA : 0) {symbol = @of_out} - aie.objectfifo @of_in (%tile20, {%tile22}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @of_out (%tile22, {%tile20}, 2 : i32) : !aie.objectfifo> - %ext_buffer_in = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32> - %ext_buffer_out = aie.external_buffer {sym_name = "ext_buffer_out"}: memref<64xi32> - aie.objectfifo.register_external_buffers @of_in (%tile20, {%ext_buffer_in}) : (memref<64xi32>) - aie.objectfifo.register_external_buffers @of_out (%tile20, {%ext_buffer_out}) : (memref<64xi32>) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir deleted file mode 100644 index 8d0179c40..000000000 --- 
a/compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir +++ /dev/null @@ -1,88 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(xcve2302) { -// CHECK: memref.global "public" @of_in : memref<16xi32> -// CHECK-DAG: %[[TILE_2_0:.*]] = aie.tile(2, 0) -// CHECK-DAG: %[[TILE_2_2:.*]] = aie.tile(2, 2) -// CHECK-DAG: %[[TILE_2_3:.*]] = aie.tile(2, 3) -// CHECK-DAG: %[[TILE_3_3:.*]] = aie.tile(3, 3) -// CHECK-DAG: %[[OF_IN_0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_0"} -// CHECK-DAG: %[[OF_IN_0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_0"} -// CHECK-DAG: %[[OF_IN_1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of_in_cons_buff_1_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of_in_cons_buff_1_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_1"} -// CHECK-DAG: %[[OF_IN_1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_1"} -// CHECK-DAG: %[[OF_IN_2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_in_cons_buff_2_0"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_in_cons_buff_2_1"} : memref<16xi32> -// CHECK-DAG: %[[OF_IN_2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_2"} -// CHECK-DAG: %[[OF_IN_2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_2"} -// CHECK-DAG: %[[OF_IN_PROD_LOCK:.*]] 
= aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_prod_lock_0"} -// CHECK-DAG: %[[OF_IN_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_3_3]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_3]], DMA : 0) -// CHECK-DAG: aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_2]], DMA : 0) -// CHECK-DAG: %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32> -// CHECK-DAG: aie.shim_dma_allocation @of_in(MM2S, 0, 2) -// CHECK: %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_IN_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_0_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_IN_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_0_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_0_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_IN_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_1_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_IN_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_1_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_1_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: 
%[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) { -// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[OF_IN_2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_2_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[OF_IN_2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[OF_IN_2_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[OF_IN_2_CONS_CONS_LOCK]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb3: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @shim_broadcast { - aie.device(xcve2302) { - %tile20 = aie.tile(2, 0) - %tile22 = aie.tile(2, 2) - %tile23 = aie.tile(2, 3) - %tile33 = aie.tile(3, 3) - aie.flow(%tile20, DMA : 0, %tile33, DMA : 0) {symbol = @of_in} - aie.flow(%tile20, DMA : 0, %tile23, DMA : 0) {symbol = @of_in} - aie.flow(%tile20, DMA : 0, %tile22, DMA : 0) {symbol = @of_in} - aie.objectfifo @of_in (%tile20, {%tile22, %tile23, %tile33}, 2 : i32) : !aie.objectfifo> - %ext_buffer_in = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32> - aie.objectfifo.register_external_buffers @of_in (%tile20, {%ext_buffer_in}) : (memref<64xi32>) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir deleted file mode 100644 index 44ef00a9a..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir +++ /dev/null @@ -1,132 +0,0 @@ - -// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @objfifo : memref<16xi32> -// CHECK-DAG: %[[TILE_1_2:.*]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_1_3:.*]] = aie.tile(1, 3) -// CHECK-DAG: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = 
"objfifo_prod_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_3"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "objfifo_prod_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_2_3:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"} -// CHECK-DAG: %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_4:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_5:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_2"} : memref<16xi32> -// CHECK-DAG: %[[BUFFER_1_3_6:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_3"} : memref<16xi32> -// CHECK-DAG: %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "objfifo_cons_prod_lock_0"} -// CHECK-DAG: %[[LOCK_1_3_7:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"} -// CHECK-DAG: aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_3]], DMA : 0) {symbol = @objfifo} -// CHECK: func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 3) -// CHECK: func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_1_2_0]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_1_2_1]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) -// CHECK: func.call @some_work(%[[BUFFER_1_2_2]]) : (memref<16xi32>) -> () -// 
CHECK: aie.use_lock(%[[LOCK_1_2_3]], Release, 3) -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], Release, 1) -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_1_2_0]]) : (memref<16xi32>) -> () -// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 2) -// CHECK: func.call @some_work(%[[BUFFER_1_2_1]]) : (memref<16xi32>) -> () -// CHECK: func.call @some_work(%[[BUFFER_1_2_2]]) : (memref<16xi32>) -> () -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) { -// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_1]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_2_2]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_2]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) { -// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5) -// CHECK: ^bb1: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_7]], Release, 1) -// 
CHECK: aie.next_bd ^bb2 -// CHECK: ^bb2: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_4]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb3 -// CHECK: ^bb3: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_5]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb4 -// CHECK: ^bb4: -// CHECK: aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1) -// CHECK: aie.dma_bd(%[[BUFFER_1_3_6]] : memref<16xi32>) {len = 16 : i32} -// CHECK: aie.use_lock(%[[LOCK_1_3_7]], Release, 1) -// CHECK: aie.next_bd ^bb1 -// CHECK: ^bb5: -// CHECK: aie.end -// CHECK: } -// CHECK: } -module @singleFifo { - aie.device(npu1_4col) { - %tile12 = aie.tile(1, 2) - %tile13 = aie.tile(1, 3) - aie.flow(%tile12, DMA : 0, %tile13, DMA : 0) {symbol = @objfifo} - aie.objectfifo @objfifo (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo> - func.func @some_work(%line_in:memref<16xi32>) -> () { - return - } - %core12 = aie.core(%tile12) { - // this acquires 2 elements - %subview0 = aie.objectfifo.acquire @objfifo (Produce, 3) : !aie.objectfifosubview> - %elem00 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem01 = aie.objectfifo.subview.access %subview0[1] : !aie.objectfifosubview> -> memref<16xi32> - %elem02 = aie.objectfifo.subview.access %subview0[2] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem00) : (memref<16xi32>) -> () - func.call @some_work(%elem01) : (memref<16xi32>) -> () - func.call @some_work(%elem02) : (memref<16xi32>) -> () - // this should only acquire one new element, previous two are still acquired - %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview> - %elem10 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<16xi32> - func.call 
@some_work(%elem10) : (memref<16xi32>) -> () - // one new acquire should take place - aie.objectfifo.release @objfifo (Produce, 3) - aie.objectfifo.release @objfifo (Produce, 1) - %subview2 = aie.objectfifo.acquire @objfifo (Produce, 2) : !aie.objectfifosubview> - %elem20 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem21 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview> -> memref<16xi32> - func.call @some_work(%elem20) : (memref<16xi32>) -> () - func.call @some_work(%elem21) : (memref<16xi32>) -> () - // no new acquires should take place, elem30 should be third element of objFifo (with index 2) - %subview3 = aie.objectfifo.acquire @objfifo (Produce, 2) : !aie.objectfifosubview> - %elem30 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview> -> memref<16xi32> - %elem31 = aie.objectfifo.subview.access %subview3[1] : !aie.objectfifosubview> -> memref<16xi32> - //%elem32 = aie.subview.access %subview3[2] : !aie.subview> -> memref<16xi32> // expected to fail if this line is uncommented - func.call @some_work(%elem30) : (memref<16xi32>) -> () - func.call @some_work(%elem31) : (memref<16xi32>) -> () - aie.end - } - } -} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td index 5a63a7a2a..651384926 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td @@ -21,6 +21,17 @@ def AMDAIE_CopyOpOperateOn: I32EnumAttr<"CopyOpOperateOn", { } +def AMDAIE_LockAction: I32EnumAttr<"LockAction", + "The action to be performed on a lock", + [ + I32EnumAttrCase<"Acquire", 0>, + I32EnumAttrCase<"AcquireGreaterOrEqual", 1>, + I32EnumAttrCase<"Release", 2> + ] + > { + let cppNamespace = "mlir::iree_compiler::AMDAIE"; +} + def LogicalObjectFifoPort: I32EnumAttr<"LogicalObjectFifoPort", "The logical objectfifo ports.", [ diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index da869deed..78d4d05ed 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -50,6 +50,12 @@ void ChannelOp::getAsmResultNames( setNameFn(getResult(), "channel"); } +TileOp ChannelOp::getTileOp() { + auto res = dyn_cast_if_present(getTile().getDefiningOp()); + assert(res && "`amdaie.channel` expects an `amdaie.tile` as tile operand"); + return res; +} + //===----------------------------------------------------------------------===// // AMDAIE_ControlCodeOp //===----------------------------------------------------------------------===// @@ -101,7 +107,9 @@ LogicalResult CoreOp::verify() { } TileOp CoreOp::getTileOp() { - return dyn_cast_if_present(getTile().getDefiningOp()); + auto res = dyn_cast_if_present(getTile().getDefiningOp()); + assert(res && "`amdaie.core` expects an `amdaie.tile` as tile operand"); + return res; } //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 7bdfabb7e..271b596b3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -229,6 +229,40 @@ def AMDAIE_LockOp: AMDAIE_Op<"lock", [ let assemblyFormat = [{ `(` $tile `(` $value `)` (`,` $init_value^)? `)` attr-dict }]; } +def AMDAIE_UseLockOp: AMDAIE_Op<"use_lock"> { + let summary = "Represents the use of a semaphore lock with a specified " + "action (acquire/release)."; + let description = [{ + This operation represents the use of a semaphore lock with a specified lock + `action` and `value`. The lock action could for example be `Acquire`, + `AcquireGreaterOrEqual` or `Release`. 
The specified `value` argument + determines the value to be used in the lock action, for example: + - `Acquire(1)`: Acquire the lock if its value is equal to 1, then subtract 1 + from it. + - `AcquireGreaterOrEqual(1)`: Acquire the lock if its value is greater or + equal to 1, then subtract 1 from it. + - `Release(1)`: Add 1 to the value of this lock. + + + Example: + + ```mlir + %lock = amdaie.lock(%tile(0)) + amdaie.use_lock(%lock, AcquireGreaterOrEqual(1)) + ``` + }]; + + let arguments = ( + ins Index:$lock, + AMDAIE_LockAction:$action, + I8Attr:$value + ); + + let assemblyFormat = [{ + `(` $lock `,` $action `(` $value `)` `)` attr-dict + }]; +} + //===----------------------------------------------------------------------===// // IREE AMDAIE DMA Utility Ops //===----------------------------------------------------------------------===// @@ -294,6 +328,10 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ ConfinedAttr]>:$value ); + let extraClassDeclaration = [{ + TileOp getTileOp(); + }]; + let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }]; } @@ -876,7 +914,7 @@ def AMDAIE_LogicalObjectFifoFromBuffersOp return cast(getOutput().getType()) .getElementType(); } - + // Return the encapsulated buffers on the requested tile. 
llvm::SmallVector getBuffersOnTile(TileOp tileOp); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp index 50d72b077..39c044d59 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp @@ -27,11 +27,9 @@ struct AMDAIESession AMDAIE::registerAMDAIEPasses(); AMDAIE::registerAMDAIEAssignBufferAddressesBasic(); AMDAIE::registerAMDAIEAssignBufferDescriptorIDs(); - AMDAIE::registerAMDAIEAssignLockIDs(); AMDAIE::registerAMDAIECoreToStandard(); AMDAIE::registerAMDAIELocalizeLocks(); AMDAIE::registerAMDAIENormalizeAddressSpaces(); - AMDAIE::registerAMDAIEObjectFifoStatefulTransform(); AMDAIE::registerAMDAIERoutePathfinderFlows(); AMDAIE::registerAMDAIEDmaToNpu(); AMDAIE::registerAIRConversionPasses(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp new file mode 100644 index 000000000..652cb2efb --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp @@ -0,0 +1,234 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Support/MathExtras.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/Dialect/SCF/Utils/Utils.h" + +#define DEBUG_TYPE "iree-amdaie-acquire-release-to-use-lock" + +namespace mlir::iree_compiler::AMDAIE { + +template +FailureOr getLogicalObjFifoOperatedOn( + T op) { + auto copyOp = + dyn_cast_if_present(op.getDma().getDefiningOp()); + if (!copyOp) + return op.emitOpError() << "should operate on a copy-like operation"; + auto logicalObjFifo = + op.getPort() == LogicalObjectFifoPort::Consume + ? dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()) + : dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + if (!logicalObjFifo) { + return copyOp.emitOpError() + << "should operate on an `amdaie.logicalobjectfifo.from_buffers` op"; + } + return logicalObjFifo; +} + +/// Unroll the scf.for loops inside the core operations based on the depths of +/// the acquired objFifos. 
+LogicalResult coreLoopUnroll(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) { + WalkResult res = coreOp.walk([&](scf::ForOp forOp) { + llvm::SmallDenseSet depths; + for (auto acqOp : + forOp.getBody()->getOps()) { + FailureOr maybeLogicalObjFifo = + getLogicalObjFifoOperatedOn(acqOp); + if (failed(maybeLogicalObjFifo)) return WalkResult::interrupt(); + AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo = + maybeLogicalObjFifo.value(); + depths.insert(logicalObjFifo.getDepth()); + } + int unrollFactor = + std::accumulate(depths.begin(), depths.end(), 1, std::lcm); + if (unrollFactor > 1 && + failed(mlir::loopUnrollByFactor(forOp, unrollFactor))) { + forOp.emitOpError() << "could not be unrolled with unrollFactor: " + << unrollFactor << "\n"; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +FailureOr getLockToBeUsed( + AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo, + AMDAIE::TileOp tileOp, LogicalObjectFifoPort port, LockAction lockAction) { + // Retrieve the lock to be used based on the port and lock action. + SmallVector consumerLocks = + logicalObjFifo.getConsumerLocksOnTile(tileOp); + if (consumerLocks.size() != 1) { + return logicalObjFifo.emitOpError() + << "expected a single consumer lock for tile: " + << tileOp.getResult(); + } + SmallVector producerLocks = + logicalObjFifo.getProducerLocksOnTile(tileOp); + if (producerLocks.size() != 1) { + return logicalObjFifo.emitOpError() + << "expected a single producer lock for tile: " + << tileOp.getResult(); + } + AMDAIE::LockOp lockOp; + if (lockAction == LockAction::Acquire || + lockAction == LockAction::AcquireGreaterOrEqual) { + lockOp = port == LogicalObjectFifoPort::Consume ? consumerLocks[0] + : producerLocks[0]; + } else if (lockAction == LockAction::Release) { + lockOp = port == LogicalObjectFifoPort::Consume ? 
producerLocks[0] + : consumerLocks[0]; + } else { + return logicalObjFifo.emitOpError() + << "used in unsupported lock action: " << stringifyEnum(lockAction); + } + return lockOp; +} + +LogicalResult acquireToUseLock(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) { + OpBuilder::InsertionGuard g(rewriter); + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + DenseMap + logicalObjFifoToIndex; + SmallVector toBeErased; + WalkResult res = coreOp.walk([&](AMDAIE::LogicalObjectFifoAcquire acqOp) { + LLVM_DEBUG(llvm::dbgs() + << "Convert acquire op: " << acqOp.getOutput() << "\n"); + std::optional maybeAcqSize = acqOp.getSize(); + assert(maybeAcqSize && maybeAcqSize.value() == 1 && + "logic currently only handles size set and equal to 1"); + int acqSize = maybeAcqSize.value(); + + FailureOr maybeLogicalObjFifo = + getLogicalObjFifoOperatedOn(acqOp); + if (failed(maybeLogicalObjFifo)) return WalkResult::interrupt(); + AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo = + maybeLogicalObjFifo.value(); + + FailureOr maybeLockOp = + getLockToBeUsed(logicalObjFifo, tileOp, acqOp.getPort(), + LockAction::AcquireGreaterOrEqual); + if (failed(maybeLockOp)) return WalkResult::interrupt(); + + rewriter.setInsertionPoint(acqOp); + rewriter.create(acqOp.getLoc(), maybeLockOp.value(), + LockAction::AcquireGreaterOrEqual, + acqSize); + + // Rotate through buffers based on access index. 
+ SmallVector buffers = + logicalObjFifo.getBuffersOnTile(tileOp); + if (!logicalObjFifoToIndex.contains(logicalObjFifo)) + logicalObjFifoToIndex[logicalObjFifo] = 0; + size_t bufferIndex = logicalObjFifoToIndex[logicalObjFifo] % buffers.size(); + for (Operation *userOp : acqOp->getUsers()) { + auto accessOp = dyn_cast(userOp); + if (!accessOp) { + acqOp.emitOpError() << "currently only supports " + "`amdaie.logicalobjectfifo.access` users"; + return WalkResult::interrupt(); + } + AMDAIE::BufferOp bufferOp = buffers[bufferIndex]; + accessOp.getResult().replaceAllUsesWith(bufferOp.getResult()); + toBeErased.push_back(accessOp); + } + logicalObjFifoToIndex[logicalObjFifo] += acqSize; + toBeErased.push_back(acqOp); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + for (Operation *op : toBeErased) { + op->dropAllUses(); + rewriter.eraseOp(op); + } + return success(); +} + +LogicalResult releaseToUseLock(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) { + OpBuilder::InsertionGuard g(rewriter); + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + SmallVector toBeErased; + WalkResult res = coreOp.walk([&](AMDAIE::LogicalObjectFifoRelease relOp) { + LLVM_DEBUG(llvm::dbgs() << "Convert release op: " << relOp << "\n"); + std::optional maybeRelSize = relOp.getSize(); + assert(maybeRelSize && maybeRelSize.value() == 1 && + "logic currently only handles size set and equal to 1"); + int relSize = maybeRelSize.value(); + + FailureOr maybeLogicalObjFifo = + getLogicalObjFifoOperatedOn(relOp); + if (failed(maybeLogicalObjFifo)) return WalkResult::interrupt(); + + FailureOr maybeLockOp = + getLockToBeUsed(maybeLogicalObjFifo.value(), tileOp, relOp.getPort(), + LockAction::Release); + if (failed(maybeLockOp)) return WalkResult::interrupt(); + + rewriter.setInsertionPoint(relOp); + rewriter.create(relOp.getLoc(), maybeLockOp.value(), + LockAction::Release, relSize); + toBeErased.push_back(relOp); + return WalkResult::advance(); + }); + if 
(res.wasInterrupted()) return failure(); + for (Operation *op : toBeErased) { + op->dropAllUses(); + rewriter.eraseOp(op); + } + return success(); +} + +namespace { + +struct AMDAIEAcquireReleaseToUseLockPass + : public impl::AMDAIEAcquireReleaseToUseLockBase< + AMDAIEAcquireReleaseToUseLockPass> { + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + + void runOnOperation() override { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + + WalkResult res = parentOp->walk([&](AMDAIE::CoreOp coreOp) { + // Loops need to be unrolled based on the depths of the logical + // objectFifos so `amdaie.use_lock` ops can be inserted correctly for + // double buffering purposes, without need for a dependency on the loop + // induction variable. + if (failed(coreLoopUnroll(rewriter, coreOp))) { + return WalkResult::interrupt(); + } + if (failed(acquireToUseLock(rewriter, coreOp))) { + return WalkResult::interrupt(); + } + if (failed(releaseToUseLock(rewriter, coreOp))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); + } +}; + +} // namespace + +std::unique_ptr createAMDAIEAcquireReleaseToUseLockPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp deleted file mode 100644 index b25fe3553..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -#include "iree-amd-aie/IR/AMDAIEOps.h" -#include "iree-amd-aie/Transforms/Passes.h" -#include "iree-amd-aie/Transforms/Transforms.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/Support/MathExtras.h" -#include "mlir/Dialect/SCF/Transforms/Transforms.h" -#include "mlir/Dialect/SCF/Utils/Utils.h" - -#define DEBUG_TYPE "iree-amdaie-core-loop-unroll" - -namespace mlir::iree_compiler::AMDAIE { - -/// Unroll the scf.for loops inside the core operations based on the depths of -/// the acquired objFifos. -LogicalResult coreLoopUnroll(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) { - WalkResult res = coreOp.walk([&](scf::ForOp forOp) { - llvm::SmallDenseSet depths; - for (auto acqOp : - forOp.getBody()->getOps()) { - auto copyOp = - dyn_cast_if_present(acqOp.getDma().getDefiningOp()); - if (!copyOp) { - acqOp.emitOpError() << "should operate on a copy-like operation"; - return WalkResult::interrupt(); - } - auto logicalObjFifo = - acqOp.getPort() == LogicalObjectFifoPort::Consume - ? 
dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()) - : dyn_cast_if_present( - copyOp.getSource().getDefiningOp()); - depths.insert(logicalObjFifo.getDepth()); - } - int unrollFactor = - std::accumulate(depths.begin(), depths.end(), 1, std::lcm); - if (unrollFactor > 1 && - failed(mlir::loopUnrollByFactor(forOp, unrollFactor))) { - forOp.emitOpError() << "could not be unrolled with unrollFactor: " - << unrollFactor << "\n"; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - return success(); -} - -namespace { - -struct AMDAIECoreLoopUnrollPass - : public impl::AMDAIECoreLoopUnrollBase { - void getDependentDialects(DialectRegistry &registry) const override { - registry.insert(); - } - - void runOnOperation() override { - Operation *parentOp = getOperation(); - IRRewriter rewriter(parentOp->getContext()); - - WalkResult res = parentOp->walk([&](AMDAIE::CoreOp coreOp) { - if (failed(coreLoopUnroll(rewriter, coreOp))) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return signalPassFailure(); - } -}; - -} // namespace - -std::unique_ptr createAMDAIECoreLoopUnrollPass() { - return std::make_unique(); -} - -} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 1736f0879..ad04e6ecf 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -11,13 +11,17 @@ // //===----------------------------------------------------------------------===// +#include "AMDAIELowerToAIE.h" + #include #include #include "aie/AIEDialect.h" #include "aie/AIEXDialect.h" +#include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" #include 
"iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "llvm/ADT/STLExtras.h" @@ -33,55 +37,38 @@ using namespace xilinx; namespace mlir::iree_compiler::AMDAIE { -namespace { - -/// Utility to remap the provided operation's operands. -void remapOperands(Operation *op, IRMapping &mapper) { - for (int i = 0; i < op->getNumOperands(); ++i) { - Value operand = op->getOperand(i); - if (mapper.contains(operand)) { - op->setOperand(i, mapper.lookup(operand)); - } - } -} - -/// It is dangerous to erase ops with `rewriter` without erasing them from -/// `mapper` too, as addresses of Operations/Values can be reused, resulting in -/// unexpected key-value pairs in `mapper`. Use this utility if `mapper` might -/// be used after `op` is erased. -void eraseOp(IRRewriter &rewriter, IRMapping &mapper, Operation *op) { - for (Value result : op->getResults()) { - mapper.erase(result); - } - mapper.erase(op); - op->dropAllUses(); - rewriter.eraseOp(op); -} - //===----------------------------------------------------------------------===// -// Convert amdaie.core operation to aie.core +// AIEDeviceBuilder utilities //===----------------------------------------------------------------------===// -/// Utility to convert vectors of `size` and `stride` into an -/// `AIE::BDDimLayoutArrayAttr`. -AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr( - IRRewriter &rewriter, const SmallVector &sizes, +AIE::BDDimLayoutArrayAttr +AIEDeviceBuilder::convertSizeStrideToBDDimLayoutArrayAttr( + const SmallVector &sizes, const SmallVector &strides) { assert(sizes.size() == strides.size() && "expected stride and size vectors of same size"); + // Fold remaining dimensions, assuming zero offsets as offsets should be taken + // care of separately. 
+ SmallVector offsets( + strides.size(), getAsIndexOpFoldResult(rewriter.getContext(), 0)); + SmallVector newOffsets; + SmallVector newSizes; + SmallVector newStrides; + foldDims(offsets, sizes, strides, newOffsets, newSizes, newStrides); + SmallVector bdDimLayoutAttr; // If the access pattern (strides/sizes) have a single dimension, make it // implicit with an empty `BDDimLayoutAttr` as this is what the AIE dialect // expects. - if (strides.size() == 1) { - std::optional stride = getConstantIntValue(strides[0]); + if (newStrides.size() == 1) { + std::optional stride = getConstantIntValue(newStrides[0]); if (stride && stride.value() == 1) { return AIE::BDDimLayoutArrayAttr::get(rewriter.getContext(), ArrayRef(bdDimLayoutAttr)); } } - bdDimLayoutAttr.reserve(sizes.size()); - for (auto [size, stride] : llvm::zip(sizes, strides)) { + bdDimLayoutAttr.reserve(newSizes.size()); + for (auto [size, stride] : llvm::zip(newSizes, newStrides)) { bdDimLayoutAttr.push_back(AIE::BDDimLayoutAttr::get( rewriter.getContext(), getConstantIntValue(size).value(), getConstantIntValue(stride).value())); @@ -90,226 +77,128 @@ AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr( ArrayRef(bdDimLayoutAttr)); } -/// Utility to create an `aie.objectfifo` operation from -/// `amdaie.circular_dma_cpy_nd`. 
-FailureOr createObjectFifo( - IRRewriter &rewriter, AMDAIE::ConnectionOp connectionOp, IRMapping &mapper, - AMDAIE::NpuCircularDmaCpyNdOp dmaOp, Value srcTile, ValueRange dstTiles, - StringAttr &symName) { - OpBuilder::InsertionGuard guard(rewriter); - auto sourceType = - cast(connectionOp.getSource().getType()); - auto targetType = - cast(connectionOp.getTarget().getType()); - uint8_t sourceMemSpace = sourceType.getMemorySpaceAsUInt(); - uint8_t targetMemSpace = targetType.getMemorySpaceAsUInt(); - unsigned depth; - unsigned sourceDepth = sourceType.getDepth(); - unsigned targetDepth = targetType.getDepth(); - if (sourceMemSpace == 0 && targetMemSpace == 0) { - return connectionOp.emitOpError() - << "both source and target on main memory not supported"; - } else if (sourceMemSpace == 0) { - depth = targetDepth; - } else if (targetMemSpace == 0) { - depth = sourceDepth; - } else { - if (sourceDepth != targetDepth) - return connectionOp.emitOpError() - << "unsupported sourceDepth != targetDepth"; - depth = sourceDepth; - } - - SmallVector producerChannels; - SmallVector consumerChannels; - for (Value producerChannel : connectionOp.getSourceChannels()) { - auto channelOp = - dyn_cast(producerChannel.getDefiningOp()); - if (!channelOp) { - return connectionOp.emitOpError() - << "found non-`amdaie.channel` source channel"; +/// Create a new `aie.dma_start` op with a sequence of DMA BD blocks within the +/// provided `memOp`. 
+/// +/// Example of a S2MM DMA start op being created with two DMA blocks performing +/// a circular double buffering DMA operation: +/// +/// %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { +/// %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +/// ^bb1: // 2 preds: ^bb0, ^bb2 +/// aie.use_lock(%lock_0_1_51, AcquireGreaterEqual, 2) +/// aie.dma_bd(%buffer_0_1_49 : memref<2048xi32, 1 : i32>) {len = 2048 : i32} +/// aie.use_lock(%lock_0_1_52, Release, 2) +/// aie.next_bd ^bb2 +/// ^bb2: // pred: ^bb1 +/// aie.use_lock(%lock_0_1_51, AcquireGreaterEqual, 2) +/// aie.dma_bd(%buffer_0_1_50 : memref<2048xi32, 1 : i32>) {len = 2048 : i32} +/// aie.use_lock(%lock_0_1_52, Release, 2) +/// aie.next_bd ^bb1 +void AIEDeviceBuilder::createDMA( + Operation *memOp, AIE::DMAChannelDir channelDir, int channelIndex, + AIE::BDDimLayoutArrayAttr dims, size_t acqNum, size_t relNum, int64_t len, + int64_t offset, const SmallVector &bufferOps, + const std::pair &locks) { + OpBuilder::InsertionGuard g(rewriter); + Block &endBlock = memOp->getRegion(0).getBlocks().back(); + assert(!endBlock.getOps().empty() && + "expected last block to have aie.end"); + Block *lastDmaBlock = endBlock.getSinglePredecessor(), + *dmaBlock = rewriter.createBlock(&endBlock), + *bdBlock = rewriter.createBlock(&endBlock); + + // Create DMA channel. 
+ rewriter.setInsertionPointToStart(dmaBlock); + rewriter.create(rewriter.getUnknownLoc(), channelDir, + channelIndex, /*repeatCount*/ 0, bdBlock, + &endBlock); + if (lastDmaBlock) lastDmaBlock->getTerminator()->setSuccessor(dmaBlock, 1); + + auto createBdBlockOps = [&](AIE::BufferOp buff, Block *succ) { + AIE::LockOp acqLock = locks.first, relLock = locks.second; + rewriter.create(rewriter.getUnknownLoc(), acqLock, + AIE::LockAction::AcquireGreaterEqual, + acqNum); + if (!dims.getValue().empty()) { + rewriter.create(rewriter.getUnknownLoc(), buff, offset, len, + dims); + } else { + rewriter.create(rewriter.getUnknownLoc(), buff, offset, + len); } - producerChannels.push_back(channelOp); - } - for (Value consumerChannel : connectionOp.getTargetChannels()) { - auto channelOp = - dyn_cast(consumerChannel.getDefiningOp()); - if (!channelOp) { - return connectionOp.emitOpError() - << "found non-`amdaie.channel` source channel"; + rewriter.create(rewriter.getUnknownLoc(), relLock, + AIE::LockAction::Release, relNum); + rewriter.create(rewriter.getUnknownLoc(), succ); + }; + + // Create Bd blocks. + Block *succ = nullptr, *curr = bdBlock; + for (size_t blockIndex = 0; blockIndex < bufferOps.size(); ++blockIndex) { + if (blockIndex == bufferOps.size() - 1) { + succ = bdBlock; + } else { + succ = rewriter.createBlock(&endBlock); } - consumerChannels.push_back(channelOp); + rewriter.setInsertionPointToStart(curr); + createBdBlockOps(bufferOps[blockIndex], succ); + curr = succ; } +} - // Convert source and target sizes and strides to `BDDimLayoutArrayAttr`s, - // which the `aie.objectfifo` works with. 
- AIE::BDDimLayoutArrayAttr sourceDims = - convertSizeStrideToBDDimLayoutArrayAttr( - rewriter, dmaOp.getSourceMixedSizes(), dmaOp.getSourceMixedStrides()); - - AIE::BDDimLayoutArrayAttr layoutAttr = - convertSizeStrideToBDDimLayoutArrayAttr( - rewriter, dmaOp.getTargetMixedSizes(), dmaOp.getTargetMixedStrides()); - // The aie.objectfifo expects a `BDDimLayoutArrayAttr` for each consumer. A - // single one for all consumers will error out. - SmallVector targetDimsVec(dstTiles.size(), - layoutAttr); - - AIE::BDDimLayoutArrayArrayAttr targetDims = - AIE::BDDimLayoutArrayArrayAttr::get(rewriter.getContext(), - ArrayRef(targetDimsVec)); - - // For now, set data type based on source and target memory space. Use - // L2/MemTile type if either source or target is located on L2. Otherwise, use - // the most local type. - // TODO(jornt): Not very clear and clean, but this is to mimic how AIE - // objectfifos are set up and it is probably better to adjust AIE objectfifos - // directly to make this more clean. - // TODO(jornt): I think objectfifos should support source type != dest type. - MemRefType srcType = cast(connectionOp.getSourceType()) - .getElementType(); - MemRefType dstType = cast(connectionOp.getTargetType()) - .getElementType(); - ArrayRef sourceShape = srcType.getShape(); - ArrayRef targetShape = dstType.getShape(); - int64_t sourceSize = std::accumulate(sourceShape.begin(), sourceShape.end(), - 1, std::multiplies<>()); - int64_t targetSize = std::accumulate(targetShape.begin(), targetShape.end(), - 1, std::multiplies<>()); - MemRefType memrefType = - sourceSize < targetSize - ? 
MemRefType::get({sourceSize}, srcType.getElementType(), - MemRefLayoutAttrInterface{}, - srcType.getMemorySpace()) - : MemRefType::get({targetSize}, dstType.getElementType(), - MemRefLayoutAttrInterface{}, - dstType.getMemorySpace()); - AIE::AIEObjectFifoType dtype = AIE::AIEObjectFifoType::get(memrefType); - auto fifo = rewriter.create( - rewriter.getUnknownLoc(), symName, srcTile, dstTiles, - rewriter.getIntegerAttr(rewriter.getI32Type(), depth), dtype, sourceDims, - targetDims); - - // Insert flow ops - rewriter.setInsertionPoint(fifo); - for (AMDAIE::ChannelOp producerChannel : producerChannels) { - for (AMDAIE::ChannelOp consumerChannel : consumerChannels) { - Value aieProducerTile = mapper.lookup(producerChannel.getTile()); - Value aieConsumerTile = mapper.lookup(consumerChannel.getTile()); - rewriter.create( - rewriter.getUnknownLoc(), aieProducerTile, AIE::WireBundle::DMA, - producerChannel.getValue(), aieConsumerTile, AIE::WireBundle::DMA, - consumerChannel.getValue(), FlatSymbolRefAttr::get(fifo->getContext(), fifo.getName())); - } - } +AIE::ShimDMAAllocationOp AIEDeviceBuilder::createShimDmaAllocation( + Block *deviceBlock, AMDAIE::TileOp tileOp, AIE::DMAChannelDir dmaChannelDir, + uint8_t channel, MemRefType memrefType, int &connectionIndex) { + OpBuilder::InsertionGuard g(rewriter); + auto shimDmaAllocOp = rewriter.create( + rewriter.getUnknownLoc(), "shim_" + std::to_string(connectionIndex++), + dmaChannelDir, channel, getConstantIndexOrAssert(tileOp.getCol())); + rewriter.setInsertionPointToStart(deviceBlock); + StringRef symName = shimDmaAllocOp.getSymName(); + rewriter.create(rewriter.getUnknownLoc(), symName, + rewriter.getStringAttr("public"), + memrefType, nullptr, false, nullptr); + return shimDmaAllocOp; +} - return fifo; +void AIEDeviceBuilder::eraseOp(Operation *op) { + for (Value result : op->getResults()) mapper.erase(result); + mapper.erase(op); + op->dropAllUses(); + rewriter.eraseOp(op); } -/// Convert `amdaie.logicalobjectfifo.access` 
to -/// `aie.objectfifo.subview.access`, and refactor the memory space for -/// `memref.reinterpret_cast` ops. -LogicalResult accessOpToAIE(IRRewriter &rewriter, - AMDAIE::LogicalObjectFifoAccessOp accessOp, - IRMapping &mapper, - SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoAccessOp]\n"); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(accessOp); - if (!mapper.contains(accessOp.getInput())) { - return accessOp.emitError() - << "this access operation's input has not been mapped"; - } - auto subviewOp = dyn_cast_if_present( - mapper.lookup(accessOp.getInput()).getDefiningOp()); - if (!subviewOp) { - return accessOp.emitError() - << "access doesn't operate on an input that has been mapped to an " - "`aie.objectfifo.acquire` + subview operation"; - } +void AIEDeviceBuilder::foldDims(const SmallVector &offsets, + const SmallVector &sizes, + const SmallVector &strides, + SmallVector &newOffsets, + SmallVector &newSizes, + SmallVector &newStrides) { + SmallVector tmpOffsets; + SmallVector tmpSizes; + SmallVector tmpStrides; + (void)foldUnitDims(offsets, sizes, strides, tmpOffsets, tmpSizes, tmpStrides); + (void)foldLinearDims(rewriter.getContext(), tmpOffsets, tmpSizes, tmpStrides, + newOffsets, newSizes, newStrides); + (void)foldSingleDim(newOffsets, newSizes, newStrides); +} - SmallVector oldReinterpretOps; - for (Operation *user : accessOp->getUsers()) { - if (isa(user)) { - oldReinterpretOps.push_back(cast(user)); +void AIEDeviceBuilder::remapOperands(Operation *op) { + for (int i = 0; i < op->getNumOperands(); ++i) { + Value operand = op->getOperand(i); + if (mapper.contains(operand)) { + op->setOperand(i, mapper.lookup(operand)); } } - if (oldReinterpretOps.empty()) { - return accessOp.emitError() << "reinterpret-cast op has not been generated"; - } - assert(oldReinterpretOps.size() == 1 && - "expected a single reinterpret-cast op"); - auto oldReinterpretOp = oldReinterpretOps[0]; - - auto type 
= cast(oldReinterpretOp.getResult().getType()); - MemRefType newType = MemRefType::Builder(type); - ArrayRef sizes = newType.getShape(); - auto [strides, baseOffset] = getStridesAndOffset(newType); - auto reinterpretOp = rewriter.create( - rewriter.getUnknownLoc(), newType, subviewOp.getOutput(), baseOffset, - sizes, strides); - - mapper.map(oldReinterpretOp.getOperation(), reinterpretOp.getOperation()); - mapper.map(oldReinterpretOp.getResult(), reinterpretOp.getResult()); - toBeErased.push_back(accessOp); - toBeErased.push_back(oldReinterpretOp); - return success(); } -/// Convert `amdaie.logicalobjectfifo.acquire` to `aie.objectfifo.acquire`. -/// Also insert `aie.objectfifo.subview.access` operations to access the -/// underlying memref and bridge the gap to AIE. -LogicalResult acquireOpToAIE(IRRewriter &rewriter, - AMDAIE::LogicalObjectFifoAcquire acquireOp, - IRMapping &mapper, - SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoAcquire]\n"); - - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(acquireOp); - auto connectionOp = dyn_cast_if_present( - acquireOp.getDma().getDefiningOp()); - if (!connectionOp) { - return connectionOp.emitError() - << "acquire doesn't operate on a `amdaie.connection`"; - } - - auto objFifo = dyn_cast( - mapper.lookup(connectionOp.getOperation())); - if (!objFifo) { - return acquireOp.emitError() - << "input isn't mapped to an `aie.objectifo` operation"; - } - - auto acquireOpType = dyn_cast(acquireOp.getType()); - assert(acquireOpType && - "Expected LogicalObjectFifoAcquire to have type " - "LogicalObjectFifoType"); - MemRefType elementType = acquireOpType.getElementType(); - - auto subviewType = AIE::AIEObjectFifoSubviewType::get(elementType); - AIE::ObjectFifoPort port = - acquireOp.getPort() == LogicalObjectFifoPort::Produce - ? 
AIE::ObjectFifoPort::Produce - : AIE::ObjectFifoPort::Consume; - auto objFifoAquireOp = rewriter.create( - rewriter.getUnknownLoc(), subviewType, port, objFifo.getName(), 1); - - auto subviewOp = rewriter.create( - rewriter.getUnknownLoc(), elementType, objFifoAquireOp.getSubview(), - /* index = */ rewriter.getIntegerAttr(rewriter.getI32Type(), 0)); - - // Map acquire op to new acquire + subview op. - mapper.map(acquireOp.getOperation(), subviewOp.getOperation()); - mapper.map(acquireOp.getResult(), subviewOp.getOutput()); - toBeErased.push_back(acquireOp); - return success(); -} +//===----------------------------------------------------------------------===// +// Convert `amdaie.core` op to `aie.core` op. +//===----------------------------------------------------------------------===// -LogicalResult coreMemrefExtractStridedMetadataToAIE( - IRRewriter &rewriter, +LogicalResult AIEDeviceBuilder::coreMemrefExtractStridedMetadataToAIE( memref::ExtractStridedMetadataOp extractStridedMetadataOp, - IRMapping &mapper, SmallVector &toBeErased) { + SmallVector &toBeErased) { LLVM_DEBUG(llvm::dbgs() << "Convert [memref.extract_strided_metadata]\n"); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(extractStridedMetadataOp); @@ -325,9 +214,8 @@ LogicalResult coreMemrefExtractStridedMetadataToAIE( return success(); } -LogicalResult coreFuncCallOpToAIE(IRRewriter &rewriter, func::CallOp oldCallOp, - IRMapping &mapper, - SmallVector &toBeErased) { +LogicalResult AIEDeviceBuilder::coreFuncCallOpToAIE( + func::CallOp oldCallOp, SmallVector &toBeErased) { LLVM_DEBUG(llvm::dbgs() << "Convert [func.call / function declaration]\n"); // Form new argument(s) and function type for the func.call op. 
SmallVector newArgs; @@ -370,34 +258,32 @@ LogicalResult coreFuncCallOpToAIE(IRRewriter &rewriter, func::CallOp oldCallOp, return success(); } -LogicalResult coreReleaseOpToAIE(IRRewriter &rewriter, - AMDAIE::LogicalObjectFifoRelease releaseOp, - IRMapping &mapper, - SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoRelease]\n"); +LogicalResult AIEDeviceBuilder::coreUseLockToAIE( + AMDAIE::UseLockOp useLockOp, SmallVector &toBeErased) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::UseLockOp]\n"); OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(releaseOp); - Operation *dmaOp = releaseOp.getDma().getDefiningOp(); - auto objFifo = dyn_cast(mapper.lookup(dmaOp)); - if (!objFifo) { - return releaseOp.emitError() - << "input isn't mapped to an `aie.objectifo` operation"; + AIE::LockAction lockAction; + if (useLockOp.getAction() == AMDAIE::LockAction::AcquireGreaterOrEqual) { + lockAction = AIE::LockAction::AcquireGreaterEqual; + } else if (useLockOp.getAction() == AMDAIE::LockAction::Acquire) { + lockAction = AIE::LockAction::Acquire; + } else if (useLockOp.getAction() == AMDAIE::LockAction::Release) { + lockAction = AIE::LockAction::Release; + } else { + return useLockOp.emitOpError() << "unsupported lock action in lowering to AIE: " + << stringifyEnum(useLockOp.getAction()); } - AIE::ObjectFifoPort port = - releaseOp.getPort() == LogicalObjectFifoPort::Produce - ? AIE::ObjectFifoPort::Produce - : AIE::ObjectFifoPort::Consume; - std::optional maybeSize = releaseOp.getSize(); - unsigned size = maybeSize ? maybeSize.value() : 1; - rewriter.replaceOpWithNewOp( - releaseOp, port, objFifo.getName(), size); + Value aieLock = mapper.lookup(useLockOp.getLock()); + rewriter.create(useLockOp.getLoc(), aieLock, lockAction, + useLockOp.getValue()); + toBeErased.push_back(useLockOp); return success(); } /// Convert `amdaie.core` into `aie.core`. 
-LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, - IRMapping &mapper, AIE::DeviceOp deviceOp, - Block *deviceCoreBlock) { +LogicalResult AIEDeviceBuilder::coreToAIE(AMDAIE::CoreOp coreOp, + AIE::DeviceOp deviceOp, + Block *deviceCoreBlock) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CoreOp]\n"); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToEnd(deviceCoreBlock); @@ -429,27 +315,19 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, WalkResult walkResult = aieCoreOp.walk([&](Operation *op) { rewriter.setInsertionPoint(op); if (TypeSwitch(op) - .Case([&](auto accessOp) { - return accessOpToAIE(rewriter, accessOp, mapper, toBeErased); - }) - .Case([&](auto acquireOp) { - return acquireOpToAIE(rewriter, acquireOp, mapper, toBeErased); - }) - .Case([&](auto releaseOp) { - return coreReleaseOpToAIE(rewriter, releaseOp, mapper, - toBeErased); - }) .Case( [&](auto extractStridedMetadataOp) { return coreMemrefExtractStridedMetadataToAIE( - rewriter, extractStridedMetadataOp, mapper, toBeErased); + extractStridedMetadataOp, toBeErased); }) .Case([&](auto oldCallOp) { - return coreFuncCallOpToAIE(rewriter, oldCallOp, mapper, - toBeErased); + return coreFuncCallOpToAIE(oldCallOp, toBeErased); + }) + .Case([&](auto useLockOp) { + return coreUseLockToAIE(useLockOp, toBeErased); }) .Default([&](Operation *op) { - remapOperands(op, mapper); + remapOperands(op); return success(); }) .failed()) { @@ -461,84 +339,28 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, coreOp.emitError("could not convert to AIEDialect ops"); return failure(); } - for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op); + for (Operation *op : toBeErased) eraseOp(op); mapper.map(coreOp.getResult(), aieCoreOp.getResult()); mapper.map(coreOp.getOperation(), aieCoreOp.getOperation()); return success(); } -} // namespace - -//===----------------------------------------------------------------------===// -// 
Convert amdaie.circular_dma_cpy_nd operation to aie.objectfifo -//===----------------------------------------------------------------------===// - -/// Convert the `amdaie.connection` operation into bidirectional object -/// fifos. -LogicalResult flowToAIE(IRRewriter &rewriter, AMDAIE::ConnectionOp connectionOp, - IRMapping &mapper, Block *deviceBlock, int &dmaId) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CircularDmaCpyNdOp]\n"); - rewriter.setInsertionPointToEnd(deviceBlock); - if (!connectionOp.getSource()) - return connectionOp.emitOpError() << "expected a source"; - auto sourceLogicalObjFifo = - dyn_cast_if_present( - connectionOp.getSource().getDefiningOp()); - if (!sourceLogicalObjFifo) - return connectionOp.emitOpError() << "expected a logical objectFifo source"; - SmallVector newSourceTiles = - llvm::map_to_vector(sourceLogicalObjFifo.getTiles(), - [&](Value tile) { return mapper.lookup(tile); }); - if (newSourceTiles.size() != 1) { - return connectionOp.emitError() - << "Can't create an `aie.objectfifo` from this flow operation as " - "`ObjectFifoCreateOp` only handles a single source tile for now, " - "but got: "; - } - Value newSourceTile = newSourceTiles[0]; - - if (!connectionOp.getTarget()) - return connectionOp.emitOpError() << "expected a source"; - auto targetLogicalObjFifo = - dyn_cast_if_present( - connectionOp.getTarget().getDefiningOp()); - if (!targetLogicalObjFifo) - return connectionOp.emitOpError() << "expected a logical objectFifo source"; - SmallVector newTargetTiles = - llvm::map_to_vector(targetLogicalObjFifo.getTiles(), - [&](Value tile) { return mapper.lookup(tile); }); - - FailureOr npuDmaUserOp = - connectionOp.getNpuCircularDmaCpyNdUser(); - if (failed(npuDmaUserOp)) return failure(); - - auto symName = "obj" + std::to_string(dmaId++); - StringAttr symAttr = rewriter.getStringAttr(symName); - FailureOr objFifo = - createObjectFifo(rewriter, connectionOp, mapper, npuDmaUserOp.value(), - newSourceTile, newTargetTiles, 
symAttr); - if (failed(objFifo)) return failure(); - mapper.map(connectionOp.getOperation(), objFifo.value().getOperation()); - return success(); -} - //===----------------------------------------------------------------------===// // Convert amdaie.controlcode operation to NPU instruction func //===----------------------------------------------------------------------===// /// Convert the `amdaie.npu.dma_cpy_nd` operation to `aiex.npu.dma_memcpy_nd`. -LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, - AMDAIE::NpuDmaCpyNdOp dmaOp, - SmallVector &toBeErased, - IRMapping &mapper, IRMapping &bindingsMapper) { +LogicalResult AIEDeviceBuilder::npuDmaCpyNdOpToAIE( + AMDAIE::NpuDmaCpyNdOp dmaOp, SmallVector &toBeErased) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaCpyNdOp]\n"); AMDAIE::ConnectionOp connectionOp = dmaOp.getConnectionOp(); SmallVector offsets, sizes, strides; ArrayRef staticOffsets, staticSizes, staticStrides; AMDAIE::BdIdOp bdIdOp; LogicalObjectFifoFromMemrefOp logicalObjFifo; - + SmallVector memOps; // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves. 
if (dmaOp.getSource()) { offsets = dmaOp.getSourceOffsets(); @@ -558,9 +380,8 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, return dmaOp.emitOpError() << "expected source to be an " "`amdaie.logicalobjectfifo.from_memref`"; } - } - - else if (dmaOp.getTarget()) { + memOps = connectionToSourceTargetMemOps[connectionOp].first; + } else if (dmaOp.getTarget()) { offsets = dmaOp.getTargetOffsets(); sizes = dmaOp.getTargetSizes(); strides = dmaOp.getTargetStrides(); @@ -578,23 +399,21 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, return dmaOp.emitOpError() << "expected target to be an " "`amdaie.logicalobjectfifo.from_memref`"; } - } - - else { + memOps = connectionToSourceTargetMemOps[connectionOp].second; + } else { return dmaOp.emitOpError() << "has neither source not target memory space as L3."; } Value memref = bindingsMapper.lookup(logicalObjFifo.getMemref()); - auto objFifo = dyn_cast( - mapper.lookup(connectionOp.getOperation())); - - uint32_t bdId = bdIdOp.getValue(); - - if (!objFifo) { - return dmaOp.emitError() - << "input isn't mapped to an `aie.objectifo` operation"; + if (memOps.size() != 1) { + return dmaOp.emitOpError() << "only a single connection op source expected"; + } + auto shimDmaAllocOp = dyn_cast(memOps[0]); + if (!shimDmaAllocOp) { + return dmaOp.emitOpError() << "expected the source of the connection to " + "be mapped to a `AIE::ShimDMAAllocationOp`"; } if (!offsets.empty() || !sizes.empty() || !strides.empty()) { @@ -607,41 +426,52 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, "aiex.npu.dma_memcpy_nd."; } + uint32_t bdId = bdIdOp.getValue(); bool issueToken = dmaOp.hasDmaWaitOpUser(); rewriter.setInsertionPoint(dmaOp); rewriter.create( dmaOp.getLoc(), SmallVector{}, 0, 0, memref, offsets, sizes, strides, staticOffsets, staticSizes, staticStrides, nullptr, - objFifo.getName(), bdId, issueToken); + shimDmaAllocOp.getSymName(), bdId, issueToken); toBeErased.push_back(dmaOp); return success(); } /// Convert the 
`amdaie.npu.dma_wait` operation to `aiex.npu.dma_wait`. -LogicalResult npuDmaWaitToAIE(IRRewriter &rewriter, AMDAIE::NpuDmaWaitOp waitOp, - SmallVector &toBeErased, - IRMapping &mapper, IRMapping &bindingsMapper) { +LogicalResult AIEDeviceBuilder::npuDmaWaitToAIE( + AMDAIE::NpuDmaWaitOp waitOp, SmallVector &toBeErased) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaWaitOp]\n"); rewriter.setInsertionPoint(waitOp); AMDAIE::ConnectionOp connectionOp = waitOp.getDmaOp().getConnectionOp(); - auto objFifo = dyn_cast( - mapper.lookup(connectionOp.getOperation())); - if (!objFifo) { - return waitOp.emitError() - << "input isn't mapped to an `aie.objectifo` operation"; + if (!connectionToSourceTargetMemOps.contains(connectionOp)) { + return connectionOp.emitOpError() + << "should be found in the connection to source/target mem ops map"; + } + SmallVector memOps = + waitOp.getDirection() == AMDAIE::DMAChannelDir::MM2S + ? connectionToSourceTargetMemOps[connectionOp].first + : connectionToSourceTargetMemOps[connectionOp].second; + if (memOps.size() != 1) { + return waitOp.emitOpError() + << "only a single connection op source expected"; + } + auto shimDmaAllocOp = dyn_cast(memOps[0]); + if (!shimDmaAllocOp) { + return waitOp.emitOpError() << "expected the source of the connection to " + "be mapped to a `AIE::ShimDMAAllocationOp`"; } rewriter.create(rewriter.getUnknownLoc(), - objFifo.getName()); + shimDmaAllocOp.getSymName()); toBeErased.push_back(waitOp); return success(); } /// Insert the control code operations into the NPU instruction function. 
-LogicalResult controlCodeToAie(IRRewriter &rewriter, - AMDAIE::ControlCodeOp controlCodeOp, - xilinx::AIEX::RuntimeSequenceOp funcOp, - IRMapping &mapper, IRMapping &bindingsMapper) { +LogicalResult AIEDeviceBuilder::controlCodeToAIE( + AMDAIE::ControlCodeOp controlCodeOp, + xilinx::AIEX::RuntimeSequenceOp funcOp) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ControlCodeOp]\n"); Block *funcBlock = &funcOp.getBody().front(); rewriter.setInsertionPointToEnd(funcBlock); @@ -667,23 +497,21 @@ LogicalResult controlCodeToAie(IRRewriter &rewriter, // TODO(jornt): This is temporarily handled already by // combining with `ConnectionOp` to create `aie.objectfifo` // until we get rid of those. - eraseOp(rewriter, mapper, dmaOp); + eraseOp(dmaOp); return success(); }) .Case([&](auto dmaOp) { - return npuDmaCpyNdOpToAIE(rewriter, dmaOp, toBeErased, mapper, - bindingsMapper); + return npuDmaCpyNdOpToAIE(dmaOp, toBeErased); }) .Case([&](auto waitOp) { - return npuDmaWaitToAIE(rewriter, waitOp, toBeErased, mapper, - bindingsMapper); + return npuDmaWaitToAIE(waitOp, toBeErased); }) .Case([&](auto endOp) { - eraseOp(rewriter, mapper, endOp); + eraseOp(endOp); return success(); }) .Default([&](Operation *op) { - remapOperands(op, mapper); + remapOperands(op); return success(); }) .failed()) { @@ -692,55 +520,339 @@ LogicalResult controlCodeToAie(IRRewriter &rewriter, return WalkResult::advance(); }); if (res.wasInterrupted()) return failure(); - for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op); + for (Operation *op : toBeErased) eraseOp(op); return success(); } //===----------------------------------------------------------------------===// -// Convert amdaie.logicalobjectfifo.link operation to `aie.objectfifo.link` +// Convert ops in Workgroup to AIE ops //===----------------------------------------------------------------------===// -LogicalResult linkToAIE(IRRewriter &rewriter, - AMDAIE::LogicalObjectFifoLink linkOp, IRMapping &mapper, - Block *deviceBlock) { - 
LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoLink]\n"); +/// Convert `amdaie.buffer` to `aie.buffer`. +LogicalResult AIEDeviceBuilder::bufferToAIE(AMDAIE::BufferOp bufferOp, + Block *deviceBlock, int &bufferId) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::BufferOp]\n"); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToEnd(deviceBlock); - SmallVector inSyms; - for (auto in : linkOp.getIns()) { - auto objFifo = dyn_cast( - mapper.lookup(in.getDefiningOp())); - if (!objFifo) { - return linkOp.emitError() - << "input isn't mapped to an `aie.objectifo` operation"; + auto elemType = cast(bufferOp.getType()); + Value tile = mapper.lookup(bufferOp.getTile()); + auto aieBufferOp = rewriter.create( + bufferOp.getLoc(), elemType, tile, + rewriter.getStringAttr("buff_" + std::to_string(bufferId++)), + /*address*/ bufferOp.getAddressAttr(), + /*mem_bank*/ nullptr); + mapper.map(bufferOp.getResult(), aieBufferOp.getResult()); + mapper.map(bufferOp.getOperation(), aieBufferOp.getOperation()); + return success(); +} + +/// Convert the `amdaie.connection` operation into `aie.flow` ops and DMA +/// operations. Depending on the location of the source/target of the +/// connection, different DMA ops are created: +/// 1. Source/target on a Shim tile: iterate through producer/consumer channels +/// and create corresponding `aie.shim_dma_allocation` ops. +/// 2. Source/target on MemTile: iterate through producer/consumer channels, +/// lookup the correct `aie.memtile_dma` op and create new DMA BD blocks inside. +/// 3. Source/target on a core tile: iterate through producer/consumer channels, +/// lookup the correct `aie.mem` op and create new DMA BD blocks inside. 
+LogicalResult AIEDeviceBuilder::connectionToAIE( + AMDAIE::ConnectionOp connectionOp, Block *deviceBlock, + int &connectionIndex) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ConnectionOp]\n"); + rewriter.setInsertionPointToEnd(deviceBlock); + SmallVector producerChannels; + SmallVector consumerChannels; + for (Value producerChannel : connectionOp.getSourceChannels()) { + auto channelOp = + dyn_cast(producerChannel.getDefiningOp()); + if (!channelOp) { + return connectionOp.emitOpError() + << "found non-`amdaie.channel` source channel"; } - inSyms.push_back( - SymbolRefAttr::get(rewriter.getContext(), objFifo.getSymName())); + producerChannels.push_back(channelOp); } - SmallVector outSyms; - for (auto out : linkOp.getOuts()) { - auto objFifo = dyn_cast( - mapper.lookup(out.getDefiningOp())); - if (!objFifo) { - return linkOp.emitError() - << "output isn't mapped to an `aie.objectifo` operation"; + for (Value consumerChannel : connectionOp.getTargetChannels()) { + auto channelOp = + dyn_cast(consumerChannel.getDefiningOp()); + if (!channelOp) { + return connectionOp.emitOpError() + << "found non-`amdaie.channel` target channel"; } - outSyms.push_back( - SymbolRefAttr::get(rewriter.getContext(), objFifo.getSymName())); + consumerChannels.push_back(channelOp); + } + // Insert flow ops. 
+ rewriter.setInsertionPointToEnd(deviceBlock); + for (AMDAIE::ChannelOp producerChannel : producerChannels) { + for (AMDAIE::ChannelOp consumerChannel : consumerChannels) { + Value aieProducerTile = mapper.lookup(producerChannel.getTile()); + Value aieConsumerTile = mapper.lookup(consumerChannel.getTile()); + rewriter.create( + rewriter.getUnknownLoc(), aieProducerTile, AIE::WireBundle::DMA, + producerChannel.getValue(), aieConsumerTile, AIE::WireBundle::DMA, + consumerChannel.getValue()); + } + } + + FailureOr maybeNpuDmaUserOp = + connectionOp.getNpuCircularDmaCpyNdUser(); + if (failed(maybeNpuDmaUserOp)) + return connectionOp.emitOpError() << "has no circular NPU DMA op user"; + + SmallVector sourceMemOps; + Value source = connectionOp.getSource(); + auto sourceObjFifoLikeOp = + dyn_cast_if_present( + source.getDefiningOp()); + if (!sourceObjFifoLikeOp) { + return connectionOp.emitOpError() + << "expected source to be an logical objFifo-like op"; + } + if (sourceObjFifoLikeOp.getMemorySpaceAsUInt() == 0) { + for (AMDAIE::ChannelOp channel : producerChannels) { + AIE::ShimDMAAllocationOp shimDmaAllocOp = createShimDmaAllocation( + deviceBlock, channel.getTileOp(), AIE::DMAChannelDir::MM2S, + channel.getValue(), sourceObjFifoLikeOp.getMemrefType(), + connectionIndex); + sourceMemOps.push_back(shimDmaAllocOp.getOperation()); + } + } else { + auto sourceObjFifo = + dyn_cast_if_present( + source.getDefiningOp()); + if (!sourceObjFifo) { + return connectionOp.emitOpError() + << "expected source to be an " + "`amdaie.logicalobjectfifo.from_buffers` op"; + } + std::optional maybeSize = maybeNpuDmaUserOp->getSourceStaticSize(); + if (!maybeSize) { + return maybeNpuDmaUserOp->emitOpError() + << "could not compute a static access size for source"; + } + std::optional maybeOffset = + maybeNpuDmaUserOp->getSourceStaticBaseOffset(); + if (!maybeOffset) { + return maybeNpuDmaUserOp->emitOpError() + << "could not compute a static base offset for source"; + } + 
AIE::BDDimLayoutArrayAttr dims = convertSizeStrideToBDDimLayoutArrayAttr( + maybeNpuDmaUserOp->getSourceMixedSizes(), + maybeNpuDmaUserOp->getSourceMixedStrides()); + SmallVector objFifoProducers = + sourceObjFifo.getCopyLikeProducers(); + SmallVector objFifoConsumers = + sourceObjFifo.getCopyLikeConsumers(); + // Default acquire/release value is 1. Will be adjusted depending on number + // of producers/consumers. + int acqNum{1}; + if (objFifoConsumers.size() < objFifoProducers.size()) { + assert(objFifoProducers.size() % objFifoConsumers.size() == 0); + acqNum = objFifoProducers.size() / objFifoConsumers.size(); + } + for (AMDAIE::ChannelOp channel : producerChannels) { + Operation *memOp = tileToMemOpMap.at(channel.getTile()); + AMDAIE::TileOp tileOp = channel.getTileOp(); + SmallVector buffers = llvm::map_to_vector( + sourceObjFifo.getBuffersOnTile(tileOp), + [&](AMDAIE::BufferOp bufferOp) { + return cast(mapper.lookup(bufferOp.getOperation())); + }); + SmallVector producerLocks = llvm::map_to_vector( + sourceObjFifo.getProducerLocksOnTile(tileOp), + [&](AMDAIE::LockOp lockOp) { + return cast(mapper.lookup(lockOp.getOperation())); + }); + SmallVector consumerLocks = llvm::map_to_vector( + sourceObjFifo.getConsumerLocksOnTile(tileOp), + [&](AMDAIE::LockOp lockOp) { + return cast(mapper.lookup(lockOp.getOperation())); + }); + if (producerLocks.size() != 1) { + return sourceObjFifo.emitOpError() + << "expected a single producer lock for tile: " + << channel.getTile() << ", channel: " << channel.getResult(); + } + if (consumerLocks.size() != 1) { + return sourceObjFifo.emitOpError() + << "expected a single consumer lock for tile: " + << channel.getTile() << ", channel: " << channel.getResult(); + } + std::pair lockPair = + std::make_pair(consumerLocks[0], producerLocks[0]); + rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); + createDMA(memOp, AIE::DMAChannelDir::MM2S, channel.getValue(), dims, + acqNum, acqNum, maybeSize.value(), maybeOffset.value(), 
buffers, + lockPair); + } + } + + SmallVector targetMemOps; + Value target = connectionOp.getTarget(); + auto targetObjFifoLikeOp = + dyn_cast_if_present( + target.getDefiningOp()); + if (!targetObjFifoLikeOp) { + return connectionOp.emitOpError() + << "expected target to be an logical objFifo-like op"; + } + if (targetObjFifoLikeOp.getMemorySpaceAsUInt() == 0) { + for (AMDAIE::ChannelOp channel : consumerChannels) { + AIE::ShimDMAAllocationOp shimDmaAllocOp = createShimDmaAllocation( + deviceBlock, channel.getTileOp(), AIE::DMAChannelDir::S2MM, + channel.getValue(), targetObjFifoLikeOp.getMemrefType(), + connectionIndex); + targetMemOps.push_back(shimDmaAllocOp.getOperation()); + } + } else { + auto targetObjFifo = + dyn_cast_if_present( + target.getDefiningOp()); + if (!targetObjFifo) { + return connectionOp.emitOpError() + << "expected target to be an " + "`amdaie.logicalobjectfifo.from_buffers` op"; + } + std::optional maybeSize = maybeNpuDmaUserOp->getTargetStaticSize(); + if (!maybeSize) { + return maybeNpuDmaUserOp->emitOpError() + << "could not compute a static access size for target"; + } + std::optional maybeOffset = + maybeNpuDmaUserOp->getTargetStaticBaseOffset(); + if (!maybeOffset) { + return maybeNpuDmaUserOp->emitOpError() + << "could not compute a static base offset for target"; + } + AIE::BDDimLayoutArrayAttr dims = convertSizeStrideToBDDimLayoutArrayAttr( + maybeNpuDmaUserOp->getTargetMixedSizes(), + maybeNpuDmaUserOp->getTargetMixedStrides()); + SmallVector objFifoProducers = + targetObjFifo.getCopyLikeProducers(); + SmallVector objFifoConsumers = + targetObjFifo.getCopyLikeConsumers(); + // Default acquire/release value is 1. Will be adjusted depending on number + // of producers/consumers.
+ int acqNum{1}; + if (objFifoProducers.size() < objFifoConsumers.size()) { + assert(objFifoConsumers.size() % objFifoProducers.size() == 0); + acqNum = objFifoConsumers.size() / objFifoProducers.size(); + } + for (AMDAIE::ChannelOp channel : consumerChannels) { + Operation *memOp = tileToMemOpMap.at(channel.getTile()); + AMDAIE::TileOp tileOp = channel.getTileOp(); + SmallVector buffers = llvm::map_to_vector( + targetObjFifo.getBuffersOnTile(tileOp), + [&](AMDAIE::BufferOp bufferOp) { + return cast(mapper.lookup(bufferOp.getOperation())); + }); + SmallVector producerLocks = llvm::map_to_vector( + targetObjFifo.getProducerLocksOnTile(tileOp), + [&](AMDAIE::LockOp lockOp) { + return cast(mapper.lookup(lockOp.getOperation())); + }); + SmallVector consumerLocks = llvm::map_to_vector( + targetObjFifo.getConsumerLocksOnTile(tileOp), + [&](AMDAIE::LockOp lockOp) { + return cast(mapper.lookup(lockOp.getOperation())); + }); + if (producerLocks.size() != 1) { + return targetObjFifo.emitOpError() + << "expected a single producer lock for tile: " + << channel.getTile(); + } + if (consumerLocks.size() != 1) { + return targetObjFifo.emitOpError() + << "expected a single consumer lock for tile: " + << channel.getTile(); + } + std::pair lockPair = + std::make_pair(producerLocks[0], consumerLocks[0]); + rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); + createDMA(memOp, AIE::DMAChannelDir::S2MM, channel.getValue(), dims, + acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, + lockPair); + } + } + + // Keep track of source/target mem ops for this connection for later retrieval + // to create NPU ops. 
+ connectionToSourceTargetMemOps[connectionOp] = + std::make_pair(sourceMemOps, targetMemOps); + return success(); +} + +LogicalResult AIEDeviceBuilder::lockToAIE(AMDAIE::LockOp lockOp, + Block *deviceBlock, int &lockIndex) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LockOp]\n"); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToEnd(deviceBlock); + Value tile = mapper.lookup(lockOp.getTile()); + auto aieLockOp = rewriter.create( + lockOp.getLoc(), tile, lockOp.getValueAttr(), lockOp.getInitValueAttr(), + rewriter.getStringAttr("lock_" + std::to_string(lockIndex++))); + mapper.map(lockOp.getResult(), aieLockOp.getResult()); + mapper.map(lockOp.getOperation(), aieLockOp.getOperation()); + return success(); +} + +template +LogicalResult logicalObjFifoFromBuffersToMemOp( + IRRewriter &rewriter, AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo, + IRMapping &mapper, Block *deviceBlock, + DenseMap &tileToMemOpMap) { + LLVM_DEBUG( + llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoFromBuffersOp]\n"); + OpBuilder::InsertionGuard guard(rewriter); + SmallVector consumers = + logicalObjFifo.getCopyLikeConsumers(); + SmallVector producers = + logicalObjFifo.getCopyLikeProducers(); + if (producers.size() > 1 && consumers.size() > 1) { + return logicalObjFifo.emitOpError() + << "has a multi-producer, multi-consumer DMA " + "pattern, which is currently not supported"; + } + // Create a memory op for every unique tile and fill it with DMA ops. + for (Value tile : logicalObjFifo.getTiles()) { + if (tileToMemOpMap.contains(tile)) continue; + Value aieTile = mapper.lookup(tile); + rewriter.setInsertionPointToEnd(deviceBlock); + auto newMemOp = rewriter.create(rewriter.getUnknownLoc(), aieTile); + rewriter.setInsertionPointToStart(&newMemOp.getRegion().emplaceBlock()); + rewriter.create(rewriter.getUnknownLoc()); + // Keep track of the MemOps on different tiles. 
+ tileToMemOpMap[tile] = newMemOp.getOperation(); + } + return success(); +} + +LogicalResult AIEDeviceBuilder::logicalObjFifoFromBuffersToAIE( + AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo, Block *deviceBlock) { + LLVM_DEBUG( + llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoFromBuffersOp]\n"); + uint8_t memSpaceUInt = logicalObjFifo.getMemorySpaceAsUInt(); + if (memSpaceUInt == 1) { + // L2 + return logicalObjFifoFromBuffersToMemOp( + rewriter, logicalObjFifo, mapper, deviceBlock, tileToMemOpMap); + } else if (memSpaceUInt == 2) { + // L1 + return logicalObjFifoFromBuffersToMemOp( + rewriter, logicalObjFifo, mapper, deviceBlock, tileToMemOpMap); + } else { + return logicalObjFifo.emitOpError() + << "has unsupported memory space for lowering to AIE: " + << std::to_string(memSpaceUInt); } - rewriter.create( - rewriter.getUnknownLoc(), rewriter.getArrayAttr(inSyms), - rewriter.getArrayAttr(outSyms), rewriter.getArrayAttr({}), - rewriter.getArrayAttr({})); return success(); } //===----------------------------------------------------------------------===// -// Convert amdaie.tile operation to aie.tile +// Convert `amdaie.tile` operation to `aie.tile` //===----------------------------------------------------------------------===// -LogicalResult tileToAIE(IRRewriter &rewriter, AMDAIE::TileOp tileOp, - IRMapping &mapper, Block *deviceBlock) { +LogicalResult AIEDeviceBuilder::tileToAIE(AMDAIE::TileOp tileOp, + Block *deviceBlock) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::TileOp]\n"); OpBuilder::InsertionGuard guard(rewriter); int64_t col = getConstantIntValue(tileOp.getCol()).value(); @@ -757,18 +869,18 @@ LogicalResult tileToAIE(IRRewriter &rewriter, AMDAIE::TileOp tileOp, // Convert amdaie.workgroup operation and insert into aie.device //===----------------------------------------------------------------------===// -LogicalResult workgroupToAIE(IRRewriter &rewriter, - AMDAIE::WorkgroupOp workgroupOp, - xilinx::AIE::DeviceOp deviceOp, - 
xilinx::AIEX::RuntimeSequenceOp npuFuncOp, - IRMapping &mapper, IRMapping &bindingsMapper) { +LogicalResult AIEDeviceBuilder::workgroupToAIE( + AMDAIE::WorkgroupOp workgroupOp, xilinx::AIE::DeviceOp deviceOp, + xilinx::AIEX::RuntimeSequenceOp npuFuncOp) { OpBuilder::InsertionGuard guard(rewriter); Block *deviceBlock = &deviceOp.getRegion().front(); Block *deviceCoreBlock = rewriter.createBlock(&deviceOp.getRegion()); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); // Walk all operations in the AIE region and convert to AIE ops - int dmaId = 0; + int bufferId{0}; + int lockId{0}; + int connectionIndex{0}; WalkResult res = workgroupOp.walk([&](Operation *op) { return TypeSwitch(op) .Case([&](auto bdIdOp) { @@ -776,49 +888,78 @@ LogicalResult workgroupToAIE(IRRewriter &rewriter, // so don't convert to AIE dialect. return WalkResult::advance(); }) + .Case([&](auto bufferOp) { + if (failed(bufferToAIE(bufferOp, deviceBlock, bufferId))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }) + .Case([&](auto channelOp) { + // Channel ops are purely used for retrieving information in other ops + // so don't convert to AIE dialect. 
+ return WalkResult::advance(); + }) .Case([&](auto dmaOp) { dmaOp.emitOpError() << "`amdaie.circular_dma_cpy_nd` unsupported in lowering to AIE"; return WalkResult::interrupt(); }) .Case([&](auto dmaOp) { - if (failed(flowToAIE(rewriter, dmaOp, mapper, deviceBlock, dmaId))) { + if (failed(connectionToAIE(dmaOp, deviceBlock, connectionIndex))) { return WalkResult::interrupt(); } return WalkResult::advance(); }) .Case([&](auto controlCodeOp) { - if (failed(controlCodeToAie(rewriter, controlCodeOp, npuFuncOp, - mapper, bindingsMapper))) { + if (failed(controlCodeToAIE(controlCodeOp, npuFuncOp))) { controlCodeOp.emitError("could not convert to AIEDialect ops"); return WalkResult::interrupt(); } return WalkResult::skip(); }) .Case([&](auto coreOp) { - if (failed(coreToAIE(rewriter, coreOp, mapper, deviceOp, - deviceCoreBlock))) { + if (failed(coreToAIE(coreOp, deviceOp, deviceCoreBlock))) { coreOp.emitError("could not convert to AIEDialect ops"); return WalkResult::interrupt(); } return WalkResult::skip(); }) - .Case([&](auto linkOp) { - if (failed(linkToAIE(rewriter, linkOp, mapper, deviceBlock))) { + .Case([&](auto lockOp) { + if (failed(lockToAIE(lockOp, deviceBlock, lockId))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }) + .Case([&](auto logicalObjFifo) { + if (failed(logicalObjFifoFromBuffersToAIE(logicalObjFifo, + deviceBlock))) { return WalkResult::interrupt(); } return WalkResult::advance(); }) + .Case([&](auto logicalObjFifo) { + // Skip placeholder ops as they don't have an equivalent in the + // AIE dialect and shim dma allocations are created from + // connections directly currently. + return WalkResult::advance(); + }) .Case([&](auto tileOp) { - if (failed(tileToAIE(rewriter, tileOp, mapper, deviceBlock))) { + if (failed(tileToAIE(tileOp, deviceBlock))) { return WalkResult::interrupt(); } return WalkResult::advance(); }) + .Case([&](auto workgroupOp) { + // Skip workgroup ops themselves. 
+ return WalkResult::advance(); + }) .Default([&](Operation *op) { rewriter.setInsertionPointToEnd(deviceBlock); if (!isa_and_present(op->getDialect())) { rewriter.clone(*op, mapper); + } else { + op->emitOpError() << "is unsupported in lowering to AIE dialect"; + return WalkResult::interrupt(); } return WalkResult::advance(); }); @@ -838,8 +979,7 @@ LogicalResult workgroupToAIE(IRRewriter &rewriter, /// `AIE::DeviceOp` into the module for every encountered `FuncOp`, and then /// traverse the function build the AIE device operation and convert all AMDAIE /// dialect operations to AIE dialect operations. -LogicalResult lowerToAIE(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); +LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { Block *moduleBlock = &moduleOp->getRegion(0).front(); // Retrieve the AMDAIEDevice from the executable target attribute. @@ -868,7 +1008,6 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { // of the aiex.runtime_sequence operation that replaces the // amdaie.controlcode. The HAL interface bindings are used to // order the function parameters correctly. - IRMapping bindingsMapper; SmallVector subspanOps; funcOp->walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) { subspanOps.push_back(subspanOp); @@ -891,14 +1030,13 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { } // Walk the AIE regions ops and convert ops into pure AIEDialect ops. 
- IRMapping mapper; + // IRMapping mapper; rewriter.setInsertionPointToStart(deviceBlock); WalkResult res = funcOp.walk([&](Operation *op) { if (isa(op)) { return WalkResult::advance(); } else if (auto workgroupOp = dyn_cast(op)) { - if (failed(workgroupToAIE(rewriter, workgroupOp, deviceOp, npuFuncOp, - mapper, bindingsMapper))) { + if (failed(workgroupToAIE(workgroupOp, deviceOp, npuFuncOp))) { return WalkResult::interrupt(); } return WalkResult::skip(); @@ -915,7 +1053,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end()); // After walking the FuncOp, it has been converted into a DeviceOp and can // safely be erased. - eraseOp(rewriter, mapper, funcOp); + eraseOp(funcOp); return WalkResult::advance(); }); if (funcRes.wasInterrupted()) return failure(); @@ -950,14 +1088,16 @@ class AMDAIELowerToAIEPass : public impl::AMDAIELowerToAIEBase { public: void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } void runOnOperation() override { // Main function call to convert all operations into AIE dialect // operations inside an AIE device. - if (failed(lowerToAIE(getOperation()))) return signalPassFailure(); + ModuleOp moduleOp = getOperation(); + AIEDeviceBuilder builder(moduleOp.getContext()); + if (failed(builder.lowerToAIE(moduleOp))) return signalPassFailure(); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h new file mode 100644 index 000000000..871d0961f --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h @@ -0,0 +1,132 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the `AIEDeviceBuilder` class, which lowers the AMDAIE +// dialect to the AIE and AIEX dialects. +// +//===----------------------------------------------------------------------===// + +#ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIELOWERTOAIE_H_ +#define IREE_AMD_AIE_TRANSFORMS_AMDAIELOWERTOAIE_H_ + +#include "aie/AIEDialect.h" +#include "aie/AIEXDialect.h" +#include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "llvm/ADT/DenseMap.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" + +using namespace xilinx; + +namespace mlir::iree_compiler::AMDAIE { + +/// Class to build an `aie.device` from a `module` containing +/// `amdaie.workgroup`. +class AIEDeviceBuilder { + public: + AIEDeviceBuilder(MLIRContext *ctx) : rewriter(ctx) {} + + LogicalResult lowerToAIE(ModuleOp moduleOp); + + private: + /// Core op conversion methods. + LogicalResult coreMemrefExtractStridedMetadataToAIE( + memref::ExtractStridedMetadataOp extractStridedMetadataOp, + SmallVector &toBeErased); + LogicalResult coreFuncCallOpToAIE(func::CallOp oldCallOp, + SmallVector &toBeErased); + LogicalResult coreUseLockToAIE(AMDAIE::UseLockOp useLockOp, + SmallVector &toBeErased); + LogicalResult coreToAIE(AMDAIE::CoreOp coreOp, AIE::DeviceOp deviceOp, + Block *deviceCoreBlock); + + /// Controlcode ops conversion methods. + LogicalResult npuDmaCpyNdOpToAIE(AMDAIE::NpuDmaCpyNdOp dmaOp, + SmallVector &toBeErased); + LogicalResult npuDmaWaitToAIE(AMDAIE::NpuDmaWaitOp waitOp, + SmallVector &toBeErased); + LogicalResult controlCodeToAIE(AMDAIE::ControlCodeOp controlCodeOp, + xilinx::AIEX::RuntimeSequenceOp funcOp); + + /// Workgroup ops conversion methods.
+ LogicalResult bufferToAIE(AMDAIE::BufferOp bufferOp, Block *deviceBlock, + int &bufferId); + LogicalResult connectionToAIE(AMDAIE::ConnectionOp connectionOp, + Block *deviceBlock, int &connectionIndex); + LogicalResult lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock, + int &lockIndex); + LogicalResult logicalObjFifoFromBuffersToAIE( + AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo, + Block *deviceBlock); + LogicalResult tileToAIE(AMDAIE::TileOp tileOp, Block *deviceBlock); + LogicalResult workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, + xilinx::AIE::DeviceOp deviceOp, + xilinx::AIEX::RuntimeSequenceOp npuFuncOp); + + /// Utilities + + /// Utility to convert vectors of `size` and `stride` into an + /// `AIE::BDDimLayoutArrayAttr`. + AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr( + const SmallVector &sizes, + const SmallVector &strides); + + /// Utility to create DMA blocks and add them to `memOp`. + void createDMA(Operation *memOp, AIE::DMAChannelDir channelDir, + int channelIndex, AIE::BDDimLayoutArrayAttr dims, + size_t acqNum, size_t relNum, int64_t len, int64_t offset, + const SmallVector &bufferOps, + const std::pair &locks); + + /// Utility to create `aie.shim_dma_allocation` ops and corresponding global + /// symbols. + AIE::ShimDMAAllocationOp createShimDmaAllocation( + Block *deviceBlock, AMDAIE::TileOp tileOp, + AIE::DMAChannelDir dmaChannelDir, uint8_t channel, MemRefType memrefType, + int &connectionIndex); + + /// It is dangerous to erase ops with `rewriter` without erasing them from + /// `mapper` too, as addresses of Operations/Values can be reused, resulting + /// in unexpected key-value pairs in `mapper`. Use this utility if `mapper` + /// might be used after `op` is erased. + void eraseOp(Operation *op); + + /// Utility to fold linear dims, unit dims and single dims in the provided + /// `offsets`, `sizes` and `strides` access patterns. 
+ void foldDims(const SmallVector &offsets, + const SmallVector &sizes, + const SmallVector &strides, + SmallVector &newOffsets, + SmallVector &newSizes, + SmallVector &newStrides); + + /// Utility to remap the provided operation's operands. + void remapOperands(Operation *op); + + /// Members + + IRRewriter rewriter; + IRMapping mapper; + /// Dedicated mapper for the HAL bindings. + IRMapping bindingsMapper; + /// Map from tile values to AIE memory op (`aie.mem` or `aie.memtile_dma`). + /// This is used to look up and add new DMA patterns to those memory ops. + DenseMap tileToMemOpMap; + /// Map from connections to source and target AIE memory ops (`aie.mem` or + /// `aie.memtile_dma`, or `aie.shim_dma_allocation`). This is mainly used for + /// looking up the global symbols from `aie.shim_dma_allocation` ops needed + /// to create AIEX NPU ops. + DenseMap, SmallVector>> + connectionToSourceTargetMemOps; +}; + +} // namespace mlir::iree_compiler::AMDAIE + +#endif // IREE_AMD_AIE_TRANSFORMS_AMDAIELOWERTOAIE_H_ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 002a9bcec..7fdad4b25 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -39,12 +39,14 @@ iree_cc_library( "Passes.h" "AMDAIECreateAIEWorkgroup.h" "AMDAIEDmaUtils.h" + "AMDAIELowerToAIE.h" "AMDAIEOpUtils.h" "AMDAIEUtils.h" "Transforms.h" SRCS "AMDAIEAccessToAcquireRelease.cpp" "AMDAIEAddLoweringStrategy.cpp" + "AMDAIEAcquireReleaseToUseLock.cpp" "AMDAIEAssignChannels.cpp" "AMDAIEAssignLogicalObjectFifoDepth.cpp" "AMDAIEAssignNpuDmaBdIds.cpp" @@ -54,7 +56,6 @@ iree_cc_library( "AMDAIECombineStridedOps.cpp" "AMDAIEControlCodeLoopUnroll.cpp" "AMDAIEConvertCoreForallToFor.cpp" - "AMDAIECoreLoopUnroll.cpp" "AMDAIECreateAIEWorkgroup.cpp" "AMDAIECreateLogicalObjectFifoLink.cpp" 
"AMDAIECreateReferenceToAllocation.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 1e0ba9bfa..06172300d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -21,6 +21,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DECL #define GEN_PASS_DEF_AMDAIEACCESSTOACQUIRERELEASE +#define GEN_PASS_DEF_AMDAIEACQUIRERELEASETOUSELOCK #define GEN_PASS_DEF_AMDAIEASSIGNCHANNELS #define GEN_PASS_DEF_AMDAIEASSIGNLOGICALOBJECTFIFODEPTH #define GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS @@ -32,7 +33,6 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR -#define GEN_PASS_DEF_AMDAIECORELOOPUNROLL #define GEN_PASS_DEF_AMDAIECREATEAIEWORKGROUP #define GEN_PASS_DEF_AMDAIECREATELOGICALOBJECTFIFOLINK #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index c1309c9b9..e2797cb6a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -142,6 +142,7 @@ static void addAMDAIEBufferizePasses(OpPassManager &pm) { } void addAMDAIEToAIEPasses(OpPassManager &passManager) { + passManager.addPass(createAMDAIEAcquireReleaseToUseLockPass()); passManager.addPass(createAMDAIECanonicalizeNpuDmaCpyNdPass()); passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIESinkIntoCorePass()); @@ -625,18 +626,19 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCSEPass()); - 
passManager.addPass(createAMDAIECreateLogicalObjectFifoLinkPass()); + // passManager.addPass(createAMDAIECreateLogicalObjectFifoLinkPass()); passManager.addPass(createAMDAIECanonicalizeDoublyStridedOpPass()); passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEConvertCoreForallToForPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createAMDAIECoreLoopUnrollPass()); passManager.addPass(createAMDAIEAssignChannelsPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIEObjFifoBufferizationPass()); + addAMDAIEToAIEPasses(passManager); // Now lower using the AIE passes from MLIR-AIE. @@ -819,10 +821,8 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { void addMLIRAIELoweringPasses(OpPassManager &passManager) { { OpPassManager &devicePM = passManager.nest(); - devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass()); devicePM.addPass(createCanonicalizerPass()); devicePM.addPass(createAMDAIEDmaToNpuPass()); - devicePM.addPass(createAMDAIEAssignLockIDsPass()); devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); devicePM.addPass(createAMDAIEPathfinderPass()); @@ -839,7 +839,6 @@ void addMLIRAIELoweringPasses(OpPassManager &passManager) { devicePM.addPass(createAMDAIENormalizeAddressSpacesPass()); devicePM.addPass(createCanonicalizerPass()); } - } // NOTE: this runs on the top-level program module containing all hal.executable diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index c3867d009..8039fe5a2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -53,6 +53,10 @@ void buildAMDAIELinkingPassPipeline(OpPassManager &passManager); /// 
semaphore operations. std::unique_ptr createAMDAIEAccessToAcquireReleasePass(); +/// Create a pass to convert logical objectFifo acquire/release ops to +/// `amdaie.use_lock` +std::unique_ptr createAMDAIEAcquireReleaseToUseLockPass(); + /// Create a pass to assign channels to connections. std::unique_ptr createAMDAIEAssignChannelsPass(); @@ -106,10 +110,6 @@ std::unique_ptr createAMDAIECleanupPass(); /// are compatible. std::unique_ptr createAMDAIECombineStridedOpsPass(); -/// Create a pass to unroll `scf.for` with synchronization ops based on -/// objectFifo buffer depths. -std::unique_ptr createAMDAIECoreLoopUnrollPass(); - /// Create a pass decomposing iree_linalg_ext.pack and unpack ops to AIR /// dialect. std::unique_ptr createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 339ed9651..f1d7f54d4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -17,6 +17,12 @@ def AMDAIEAccessToAcquireRelease : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAccessToAcquireReleasePass()"; } +def AMDAIEAcquireReleaseToUseLock : + Pass<"iree-amdaie-acquire-release-to-use-lock", ""> { + let summary = "Convert acquire/release synchronization stubs to `amdaie.use_lock`"; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAcquireReleaseToUseLockPass()"; +} + def AMDAIEAssignChannels : Pass<"iree-amdaie-assign-channels", ""> { let summary = "Assign channels to `amdaie.connection` ops."; @@ -128,13 +134,6 @@ def AMDAIEConvertCoreForallToFor : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConvertCoreForallToForPass()"; } -def AMDAIECoreLoopUnroll : - Pass<"iree-amdaie-core-loop-unroll", ""> { - let summary = "Within core ops, unroll `scf.for` with synchronization ops based on " - "objectFifo buffer 
depths."; - let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECoreLoopUnrollPass()"; -} - def AMDAIECreateAIEWorkgroup : Pass<"iree-amdaie-create-aie-workgroup", "func::FuncOp"> { let summary = "Creates a single AIE workgroup."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 61071df29..8a86c6e82 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -10,6 +10,7 @@ iree_lit_test_suite( SRCS "access_to_acquire_release.mlir" "aie_link_executables.mlir" + "acquire_release_to_use_lock.mlir" "assign_channels.mlir" "assign_logical_objectfifo_depth.mlir" "assign_npu_dma_bd_ids.mlir" @@ -20,7 +21,6 @@ iree_lit_test_suite( "combine_strided_ops.mlir" "controlcode_loop_unrolling.mlir" "convert_core_forall_to_for.mlir" - "core_loop_unroll.mlir" "create_aie_workgroup.mlir" "create_logical_objectfifo_link.mlir" "create_reference_to_allocation.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir new file mode 100644 index 000000000..7da636291 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir @@ -0,0 +1,214 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-acquire-release-to-use-lock,canonicalize,cse))" --split-input-file %s | FileCheck %s + +// CHECK-LABEL: @depth_1 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[BUFFER:.+]] = 
amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32> +// CHECK: %[[LOCK:.+]] = amdaie.lock(%[[TILE_0_2]](0)) +// CHECK: %[[LOCK_1:.+]] = amdaie.lock(%[[TILE_0_2]](1)) +// CHECK: amdaie.core +// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C4]] step %[[C1]] { +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER]] +// CHECK: linalg.fill +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1)) +// CHECK: } +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @depth_1() { + amdaie.workgroup { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c2) + %buffer = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32> + %lock = amdaie.lock(%tile(0)) + %lock_2 = amdaie.lock(%tile(1)) + %buffer_1 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32> + %lock_5 = amdaie.lock(%tile_0(0)) + %lock_6 = amdaie.lock(%tile_0(1)) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_2}) : memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_5}, {%lock_6}) : memref<1024xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> + %2 = amdaie.connection(%0, %1) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %3 = amdaie.core(%tile_0, in : [], out : [%2]) { + scf.for %arg0 = %c0 to %c4 step %c1 { + %4 = amdaie.logicalobjectfifo.acquire(%2, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.access(%4, Write) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2 : i32> + %reinterpret_cast = memref.reinterpret_cast %5 to offset: [0], sizes: [32, 32], strides: [32, 1] 
: memref<1024xi32, 2 : i32> to memref<32x32xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<32x32xi32, 2 : i32>) + amdaie.logicalobjectfifo.release(%2, Produce) {size = 1 : i32} + } + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @depth_2 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[BUFFER:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32> +// CHECK: %[[BUFFER_1:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32> +// CHECK: %[[LOCK:.+]] = amdaie.lock(%[[TILE_0_2]](0)) +// CHECK: %[[LOCK_1:.+]] = amdaie.lock(%[[TILE_0_2]](1)) +// CHECK: amdaie.core +// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C4]] step %[[C2]] { +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER]] +// CHECK: linalg.fill +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1)) +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER_1]] +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1)) +// CHECK: } +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @depth_2() { + amdaie.workgroup { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c2) + %buffer = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32> + %lock = 
amdaie.lock(%tile(0)) + %lock_2 = amdaie.lock(%tile(1)) + %buffer_3 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32> + %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32> + %lock_5 = amdaie.lock(%tile_0(0)) + %lock_6 = amdaie.lock(%tile_0(1)) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_5}, {%lock_6}) : memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> + %2 = amdaie.connection(%0, %1) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %3 = amdaie.core(%tile_0, in : [], out : [%2]) { + scf.for %arg0 = %c0 to %c4 step %c1 { + %4 = amdaie.logicalobjectfifo.acquire(%2, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.access(%4, Write) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2 : i32> + %reinterpret_cast = memref.reinterpret_cast %5 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2 : i32> to memref<32x32xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<32x32xi32, 2 : i32>) + amdaie.logicalobjectfifo.release(%2, Produce) {size = 1 : i32} + } + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @depth_4 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C21:.+]] = arith.constant 21 : index +// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[BUFFER:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32> +// CHECK: %[[BUFFER_1:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32> +// CHECK: %[[BUFFER_2:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32> +// CHECK: 
%[[BUFFER_3:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32> +// CHECK: %[[LOCK:.+]] = amdaie.lock(%[[TILE_0_2]](0)) +// CHECK: %[[LOCK_1:.+]] = amdaie.lock(%[[TILE_0_2]](1)) +// CHECK: amdaie.core +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C17:.+]] = arith.constant 17 : index +// CHECK: scf.for %[[ARG0:.+]] = %[[C1]] to %[[C17]] step %[[C8]] { +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER]] +// CHECK: index_cast +// CHECK: linalg.fill +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1) +// CHECK: arith.addi %[[ARG0]], %[[C2]] : index +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER_1]] +// CHECK: index_cast +// CHECK: linalg.fill +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1) +// CHECK: arith.addi %[[ARG0]], %[[C4]] : index +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER_2]] +// CHECK: index_cast +// CHECK: linalg.fill +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1) +// CHECK: arith.addi %[[ARG0]], %[[C6]] : index +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER_3]] +// CHECK: index_cast +// CHECK: linalg.fill +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1) +// CHECK: } +// CHECK: scf.for %[[ARG1:.+]] = %[[C17]] to %[[C21]] step %[[C2]] { +// CHECK: amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1)) +// CHECK: memref.reinterpret_cast %[[BUFFER]] +// CHECK: index_cast %[[ARG1]] +// CHECK: linalg.fill +// CHECK: amdaie.use_lock(%[[LOCK_1]], Release(1) +// CHECK: } +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = 
#executable_target_amdaie_xclbin_fb} { + func.func @depth_4() { + amdaie.workgroup { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c21 = arith.constant 21 : index + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c2) + %buffer = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32> + %lock = amdaie.lock(%tile(0)) + %lock_2 = amdaie.lock(%tile(1)) + %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32> + %buffer_5 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32> + %buffer_6 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32> + %buffer_7 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32> + %lock_5 = amdaie.lock(%tile_0(0)) + %lock_6 = amdaie.lock(%tile_0(1)) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1, %buffer_2, %buffer_3}, {%lock}, {%lock_2}) : memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_5}, {%lock_6}) : memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> + %2 = amdaie.connection(%0, %1) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) + %3 = amdaie.core(%tile_0, in : [], out : [%2]) { + scf.for %arg0 = %c1 to %c21 step %c2 { + %4 = amdaie.logicalobjectfifo.acquire(%2, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.access(%4, Write) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2 : i32> + %6 = memref.reinterpret_cast %5 to offset: [0], sizes: [32, 32], strides: [32, 1] : 
memref<1024xi32, 2 : i32> to memref<32x32xi32, 2 : i32> + %c = arith.index_cast %arg0 : index to i32 + linalg.fill ins(%c : i32) outs(%6 : memref<32x32xi32, 2 : i32>) + amdaie.logicalobjectfifo.release(%2, Produce) {size = 1 : i32} + } + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir deleted file mode 100644 index 4bb7ccd22..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir +++ /dev/null @@ -1,181 +0,0 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-core-loop-unroll,canonicalize))" --split-input-file %s | FileCheck %s - -// No change for depth 1. - -// CHECK-LABEL: @depth_1 -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: amdaie.core -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C4]] step %[[C1]] { -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: linalg.fill -// CHECK: } -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @depth_1() { - amdaie.workgroup { - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> 
!amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) { - scf.for %arg0 = %c0 to %c4 step %c1 { - %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> - %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2> - linalg.fill ins(%c0_i32 : i32) outs(%2 : memref<32x32xi32, 2>) - } - amdaie.end - } - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - amdaie.controlcode { - amdaie.end - } - } - return - } -} - -// ----- - -// CHECK-LABEL: @depth_2 -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: amdaie.core -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C4]] { -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: linalg.fill -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: linalg.fill -// CHECK: } -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @depth_2() { - amdaie.workgroup { - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c8 = arith.constant 8 : index - 
%tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo, 2> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo, 2> - %dma0 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) - %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) { - scf.for %arg0 = %c0 to %c8 step %c2 { - %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo, 2> - %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo, 2> -> memref<1024xi32, 2> - %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2> - linalg.fill ins(%c0_i32 : i32) outs(%2 : memref<32x32xi32, 2>) - } - amdaie.end - } - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - amdaie.controlcode { - amdaie.end - } - } - return - } -} - -// ----- - -// CHECK-LABEL: @depth_4 -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C17:.+]] = arith.constant 17 : index -// CHECK-DAG: %[[C21:.+]] = arith.constant 21 : index -// CHECK-DAG: amdaie.core -// CHECK: scf.for %[[ARG0:.+]] = %[[C1]] to %[[C17]] step %[[C8]] { -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: index_cast -// CHECK: linalg.fill -// CHECK: 
arith.addi %[[ARG0]], %[[C2]] : index -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: index_cast -// CHECK: linalg.fill -// CHECK: arith.addi %[[ARG0]], %[[C4]] : index -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: index_cast -// CHECK: linalg.fill -// CHECK: arith.addi %[[ARG0]], %[[C6]] : index -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: index_cast -// CHECK: linalg.fill -// CHECK: } -// CHECK: scf.for %[[ARG1:.+]] = %[[C17]] to %[[C21]] step %[[C2]] { -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK: amdaie.logicalobjectfifo.access -// CHECK: memref.reinterpret_cast -// CHECK: index_cast %[[ARG1]] -// CHECK: linalg.fill -// CHECK: } -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @depth_4() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %c16 = arith.constant 21 : index - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo, 4> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo, 4> - %dma0 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) - %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) { - scf.for %arg0 = %c1 to 
%c16 step %c2 { - %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo, 4> - %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo, 4> -> memref<1024xi32, 2> - %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2> - %c = arith.index_cast %arg0 : index to i32 - linalg.fill ins(%c : i32) outs(%2 : memref<32x32xi32, 2>) - } - amdaie.end - } - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - amdaie.controlcode { - amdaie.end - } - } - return - } -} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 45cbb3506..49e52c6e1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -25,6 +25,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +//===----------------------------------------------------------------------===// +// Workgroup tests +//===----------------------------------------------------------------------===// + // CHECK: module // CHECK: aie.device // CHECK: aiex.runtime_sequence @workgroup @@ -68,23 +72,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// NOTE: Due to an AIE check that verifies whether aie.objectfifo is linked correctly, -// this test checks two `amdaie.connection` operations, so they can be linked -// correctly. 
-// -// CHECK: aie.device -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1) -// CHECK-DAG: %[[TILE_0_0:.+]] = aie.tile(0, 0) -// CHECK: aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_2]], {%[[TILE_0_1]]} -// CHECK-NEXT: aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_0]]} -// CHECK-NEXT: aie.objectfifo.link -// CHECK-SAME: @[[OBJ0]] -// CHECK-SAME: @[[OBJ1]] -// CHECK: aiex.runtime_sequence @connections_and_link +// CHECK: module +// CHECK: aie.device +// CHECK-DAG: aie.tile(0, 2) +// CHECK-DAG: aie.tile(0, 1) +// CHECK-DAG: aie.tile(0, 0) +// CHECK: aiex.runtime_sequence @tile #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @connections_and_link() { + func.func @tile() { amdaie.workgroup { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -92,21 +88,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_0 = memref.alloc() : memref<32x64xi32> - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] () - memref.dealloc %alloc_2 : 
memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - memref.dealloc %alloc_0 : memref<32x64xi32> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] []) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) amdaie.end } } @@ -116,49 +98,25 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// NOTE: Due to an AIE check that verifies whether aie.objectfifo is linked correctly, -// this test checks two `amdaie.connection` operations, so they can be linked -// correctly. -// -// CHECK: aie.device -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1) -// CHECK-DAG: %[[TILE_0_0:.+]] = aie.tile(0, 0) -// CHECK: aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_2]] toStream [, , ] -// CHECK-NEXT: aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_0]]} -// CHECK-NEXT: aie.objectfifo.link -// CHECK-SAME: @[[OBJ0]] -// CHECK-SAME: @[[OBJ1]] -// CHECK: aiex.runtime_sequence @circular_dma_cpy_sizes_and_strides +// CHECK: module +// CHECK: aie.device +// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) +// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1) +// CHECK-DAG: aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32> +// CHECK-DAG: aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32> +// CHECK: aiex.runtime_sequence @buffer #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @circular_dma_cpy_sizes_and_strides() { + func.func @buffer() { amdaie.workgroup { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %c8 = arith.constant 8 : index - %c32 = arith.constant 32 : index - %c256 = arith.constant 256 : index - %tile_0_0 = 
amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_0 = memref.alloc() : memref<32x64xi32> - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - memref.dealloc %alloc_0 : memref<32x64xi32> + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [%c0, %c0, %c0] [%c32, %c4, %c8] [%c8, %c256, %c1]) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) amdaie.end } } @@ -168,52 +126,25 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// NOTE: Due to an AIE check that verifies whether AIE operations exist inside a -// core, it's hard to create a very small minimal test. 
-// -// CHECK: aie.device -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %{{.+}} = aie.tile(0, 1) -// CHECK-DAG: %{{.+}} = aie.tile(0, 0) -// CHECK: aie.core(%[[TILE_0_2]]) -// CHECK: %[[ACQUIRE:.+]] = aie.objectfifo.acquire -// CHECK-SAME: Produce -// CHECK: %[[ACCESS:.+]] = aie.objectfifo.subview.access %[[ACQUIRE]] -// CHECK: %[[REINTERPRET:.+]] = memref.reinterpret_cast %[[ACCESS]] -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[REINTERPRET]] : memref<32x32xi32, 1>) -// CHECK: aiex.runtime_sequence @tile_and_core_and_acquire +// CHECK: module +// CHECK: aie.device +// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) +// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1) +// CHECK-DAG: aie.lock(%[[TILE_0_1]], 4) {init = 8 : i8, sym_name = "lock_0"} +// CHECK-DAG: aie.lock(%[[TILE_0_2]], 5) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: aiex.runtime_sequence @lock #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @tile_and_core_and_acquire() { + func.func @lock() { amdaie.workgroup { - %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_0 = memref.alloc() : memref<32x64xi32> - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj2) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%conn0]) { - %0 = amdaie.logicalobjectfifo.acquire(%conn0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 1> - %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 1> to memref<32x32xi32, 1> - linalg.fill ins(%c0_i32 : i32) outs(%2 : memref<32x32xi32, 1>) - amdaie.end - } - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - memref.dealloc %alloc_0 : memref<32x64xi32> + %lock = amdaie.lock(%tile_0_1(4), 8) + %lock_1 = amdaie.lock(%tile_0_2(5), 0) amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] []) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) amdaie.end } } @@ -223,75 +154,59 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK: aie.device -// CHECK-DAG: func.func private @ukernel_A(memref, index) attributes {llvm.bareptr = true} -// CHECK-DAG: func.func private @ukernel_B(memref, index, memref, index) attributes {llvm.bareptr = true} -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK: aie.core(%[[TILE_0_2]]) -// CHECK: %[[ACQUIRE:.+]] = aie.objectfifo.acquire -// CHECK-SAME: Produce -// CHECK: %[[ACCESS:.+]] = aie.objectfifo.subview.access %[[ACQUIRE]] -// CHECK: %[[REINTERPRET:.+]] = memref.reinterpret_cast %[[ACCESS]] -// CHECK: %[[ACQUIRE0:.+]] = aie.objectfifo.acquire -// CHECK-SAME: Produce -// CHECK: %[[ACCESS0:.+]] = aie.objectfifo.subview.access %[[ACQUIRE0]] -// CHECK: %[[REINTERPRET0:.+]] = memref.reinterpret_cast %[[ACCESS0]] -// CHECK: linalg.fill ins(%{{.+}} : i32) 
outs(%[[REINTERPRET]] : memref<32x32xi32, 2>) -// CHECK: %[[BASE_BUFFER:.*]], %{{.+}}, %{{.+}}:2, %{{.+}}:2 = memref.extract_strided_metadata %[[REINTERPRET]] : -// CHECK: %[[BASE_BUFFER0:.*]], %{{.+}}, %{{.+}}:2, %{{.+}}:2 = memref.extract_strided_metadata %[[REINTERPRET0]] : -// CHECK: func.call @ukernel_A(%[[BASE_BUFFER]], %[[C0]]) : (memref, index) -> () -// CHECK: func.call @ukernel_B(%[[BASE_BUFFER]], %[[C0]], %[[BASE_BUFFER0]], %[[C0]]) : (memref, index, memref, index) -> () -// CHECK: aie.end -// CHECK: } {link_with = "/path/to/ukernel.o"} -// CHECK: aiex.runtime_sequence @lower_to_aie_ukernel +// CHECK: aie.device +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK: %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32> +// CHECK: %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_1_0:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_2"} +// CHECK: %[[LOCK_0_2_1:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"} +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0) +// CHECK: %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) { +// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb2) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_1_0]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb2: +// CHECK: aie.end +// CHECK: } +// CHECK: %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { +// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2) +// CHECK: ^bb1: +// 
CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_1]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb2: +// CHECK: aie.end +// CHECK: } +// CHECK: aiex.runtime_sequence @single_connection_single_buffer #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func private @ukernel_A(memref, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true} - func.func private @ukernel_B(memref, index, memref, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true} - func.func @lower_to_aie_ukernel() { + func.func @single_connection_single_buffer() { amdaie.workgroup { - %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_0 = memref.alloc() : memref<32x64xi32> - %alloc_1 = memref.alloc() : memref<32x32xi32, 2> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 1> - %alloc_3 = memref.alloc() : memref<32x32xf32, 2> - %alloc_4 = memref.alloc() : memref<4x8x4x8xf32, 1> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 2> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 1> -> !amdaie.logicalobjectfifo> - %obj3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_0_1} : memref<32x32xf32, 2> -> !amdaie.logicalobjectfifo> - %obj4 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_0_2} : memref<4x8x4x8xf32, 1> -> 
!amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%obj3, %obj4) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%conn0, %conn1]) { - %0 = amdaie.logicalobjectfifo.acquire(%conn0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> - %reinterpret_0 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2> - %2 = amdaie.logicalobjectfifo.acquire(%conn1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.access(%2, Write) : !amdaie.logicalobjectfifo> -> memref<1024xf32, 2> - %reinterpret_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xf32, 2> to memref<32x32xf32, 2> - linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_0 : memref<32x32xi32, 2>) - %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %reinterpret_0 : memref<32x32xi32, 2> -> memref, index, index, index, index, index - %base_buffer0, %offset0, %sizes0:2, %strides0:2 = memref.extract_strided_metadata %reinterpret_1 : memref<32x32xf32, 2> -> memref, index, index, index, index, index - func.call @ukernel_A(%base_buffer, %c0) : (memref, index) -> () - func.call @ukernel_B(%base_buffer, %c0, %base_buffer0, %c0) : (memref, index, memref, index) -> () - amdaie.end - } {link_with = "/path/to/ukernel.o"} - memref.dealloc %alloc_4 : memref<4x8x4x8xf32, 1> - memref.dealloc %alloc_3 : memref<32x32xf32, 2> - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 1> - memref.dealloc %alloc_1 : memref<32x32xi32, 2> - memref.dealloc %alloc_0 : memref<32x64xi32> + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 1) + %lock_1 = 
amdaie.lock(%tile_0_1(1), 0) + %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %lock_2 = amdaie.lock(%tile_0_2(0), 1) + %lock_3 = amdaie.lock(%tile_0_2(1), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel = amdaie.channel(%tile_0_1, 0) + %channel_1 = amdaie.channel(%tile_0_2, 0) + %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] []) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) + %3 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -301,59 +216,322 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- - -// NOTE: Due to an AIE check that verifies whether AIE operations exist inside a -// core, it's hard to create a very small minimal test. 
-// -// CHECK: aie.device -// CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %{{.+}} = aie.tile(0, 1) -// CHECK-DAG: %{{.+}} = aie.tile(0, 0) -// CHECK: aie.core(%[[TILE_0_2]]) -// CHECK: %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire -// CHECK-SAME: Consume -// CHECK: aie.objectfifo.subview.access -// CHECK-SAME: %[[ACQUIRE_0]] -// CHECK: aie.core(%[[TILE_1_2]]) -// CHECK: %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire -// CHECK-SAME: Consume -// CHECK: aie.objectfifo.subview.access -// CHECK-SAME: %[[ACQUIRE_1]] -// CHECK: aiex.runtime_sequence @tile_and_core_and_acquire_broadcast +// CHECK: aie.device(npu1_4col) +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32> +// CHECK: %[[BUFFER_0_1_0:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_1"} : memref<4096xi32, 1 : i32> +// CHECK: %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 2 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_1_1:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_2"} : memref<4096xi32, 2 : i32> +// CHECK: %[[BUFFER_0_2_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 2 : i8, sym_name = "lock_2"} +// CHECK: %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"} +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0) +// CHECK: %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) { +// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], 
AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: aie.end +// CHECK: } +// CHECK: %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { +// CHECK: %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {dimensions = #aie, ]>, len = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_2]] : memref<4096xi32, 2 : i32>) {dimensions = #aie, ]>, len = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: aie.end +// CHECK: } +// CHECK: aiex.runtime_sequence @single_connection_multi_buffer #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @tile_and_core_and_acquire_broadcast() { + func.func @single_connection_multi_buffer() { amdaie.workgroup { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %tile_1_2 = amdaie.tile(%c1, %c2) - %alloc_0 = memref.alloc() : memref<32x64xi32> - %alloc_1 = memref.alloc() 
: memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj0) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%obj2, %obj1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_2 = amdaie.core(%tile_0_2, in : [%conn1], out : []) { - %0 = amdaie.logicalobjectfifo.acquire(%conn1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 2) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %buffer_2 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %buffer_3 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %lock_2 = amdaie.lock(%tile_0_2(0), 2) + %lock_3 = amdaie.lock(%tile_0_2(1), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> + %channel = amdaie.channel(%tile_0_1, 0) + %channel_1 = amdaie.channel(%tile_0_2, 0) + %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + amdaie.controlcode { + %3 = amdaie.npu.circular_dma_cpy_nd %2([0, 0] [32, 32] [128, 1], [0, 1024] [32, 32] [64, 1]) amdaie.end } - %core_1_2 = amdaie.core(%tile_1_2, in : [%conn1], 
out : []) { - %0 = amdaie.logicalobjectfifo.acquire(%conn1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + } + return + } +} + +// ----- + +// CHECK: aie.device(npu1_4col) +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK: %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32> +// CHECK: %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_1_0:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_2"} +// CHECK: %[[LOCK_0_2_1:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"} +// CHECK: %[[BUFFER_0_1_2:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_2"} : memref<2048xi32, 1 : i32> +// CHECK: %[[LOCK_0_1_3:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_4"} +// CHECK: %[[LOCK_0_1_4:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_5"} +// CHECK: %[[BUFFER_0_2_5:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<2048xi32, 2 : i32> +// CHECK: %[[LOCK_0_2_6:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_6"} +// CHECK: %[[LOCK_0_2_7:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_7"} +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0) +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_0_2]], DMA : 1) +// CHECK: %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) { +// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb2) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_1_0]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: 
aie.next_bd ^bb1 +// CHECK: ^bb2: +// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb3, ^bb4) +// CHECK: ^bb3: +// CHECK: aie.use_lock(%[[LOCK_0_1_4]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_2]] : memref<2048xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_3]], Release, 1) +// CHECK: aie.next_bd ^bb3 +// CHECK: ^bb4: +// CHECK: aie.end +// CHECK: } +// CHECK: %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { +// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_1]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb2: +// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 1, ^bb3, ^bb4) +// CHECK: ^bb3: +// CHECK: aie.use_lock(%[[LOCK_0_2_6]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_5]] : memref<2048xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_7]], Release, 1) +// CHECK: aie.next_bd ^bb3 +// CHECK: ^bb4: +// CHECK: aie.end +// CHECK: } +// CHECK: aiex.runtime_sequence @multi_connection_single_buffer +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @multi_connection_single_buffer() { + amdaie.workgroup { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 1) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %lock_2 = amdaie.lock(%tile_0_2(0), 1) + %lock_3 = amdaie.lock(%tile_0_2(1), 
0) + %buffer_2 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %lock_4 = amdaie.lock(%tile_0_1(0), 1) + %lock_5 = amdaie.lock(%tile_0_1(1), 0) + %buffer_3 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32> + %lock_6 = amdaie.lock(%tile_0_2(0), 1) + %lock_7 = amdaie.lock(%tile_0_2(1), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel = amdaie.channel(%tile_0_1, 0) + %channel_1 = amdaie.channel(%tile_0_2, 0) + %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_1, 1) + %channel_3 = amdaie.channel(%tile_0_2, 1) + %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + amdaie.controlcode { + %6 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1]) + %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - memref.dealloc %alloc_0 : memref<32x64xi32> + } + return + } +} + +// ----- + +// CHECK: aie.device(npu1_4col) +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK: %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32> +// CHECK: %[[BUFFER_0_1_0:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_1"} : memref<4096xi32, 1 : i32> +// 
CHECK: %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 2 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_1_1:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_2"} : memref<4096xi32, 2 : i32> +// CHECK: %[[BUFFER_0_2_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 2 : i8, sym_name = "lock_2"} +// CHECK: %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"} +// CHECK: %[[BUFFER_0_1_4:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_4"} : memref<2048xi32, 1 : i32> +// CHECK: %[[BUFFER_0_1_5:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_5"} : memref<2048xi32, 1 : i32> +// CHECK: %[[BUFFER_0_1_6:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_6"} : memref<2048xi32, 1 : i32> +// CHECK: %[[BUFFER_0_1_7:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_7"} : memref<2048xi32, 1 : i32> +// CHECK: %[[LOCK_0_1_8:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 4 : i8, sym_name = "lock_4"} +// CHECK: %[[LOCK_0_1_9:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_5"} +// CHECK: %[[BUFFER_0_2_10:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_8"} : memref<2048xi32, 2 : i32> +// CHECK: %[[BUFFER_0_2_11:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_9"} : memref<2048xi32, 2 : i32> +// CHECK: %[[BUFFER_0_2_12:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_10"} : memref<2048xi32, 2 : i32> +// CHECK: %[[BUFFER_0_2_13:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_11"} : memref<2048xi32, 2 : i32> +// CHECK: %[[LOCK_0_2_14:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 4 : i8, sym_name = "lock_6"} +// CHECK: %[[LOCK_0_2_15:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_7"} +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0) +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_0_2]], DMA : 
1) +// CHECK: %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) { +// CHECK: %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb4, ^bb8) +// CHECK: ^bb4: +// CHECK: aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_4]] : memref<2048xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_8]], Release, 1) +// CHECK: aie.next_bd ^bb5 +// CHECK: ^bb5: +// CHECK: aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_5]] : memref<2048xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_8]], Release, 1) +// CHECK: aie.next_bd ^bb6 +// CHECK: ^bb6: +// CHECK: aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_6]] : memref<2048xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_8]], Release, 1) +// CHECK: aie.next_bd ^bb7 +// CHECK: ^bb7: +// CHECK: aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_7]] : memref<2048xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_8]], Release, 1) +// CHECK: aie.next_bd ^bb4 +// CHECK: ^bb8: +// CHECK: aie.end +// CHECK: } +// CHECK: 
%[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { +// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb8) +// CHECK: ^bb4: +// CHECK: aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_10]] : memref<2048xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_15]], Release, 1) +// CHECK: aie.next_bd ^bb5 +// CHECK: ^bb5: +// CHECK: aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_11]] : memref<2048xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_15]], Release, 1) +// CHECK: aie.next_bd ^bb6 +// CHECK: ^bb6: +// CHECK: aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_12]] : memref<2048xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_15]], Release, 1) +// CHECK: aie.next_bd ^bb7 +// CHECK: ^bb7: +// CHECK: aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_13]] : memref<2048xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_15]], Release, 1) +// CHECK: aie.next_bd ^bb4 +// CHECK: ^bb8: +// CHECK: aie.end +// CHECK: } +// CHECK: aiex.runtime_sequence @multi_connection_multi_buffer +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = 
#executable_target_amdaie_xclbin_fb} { + func.func @multi_connection_multi_buffer() { + amdaie.workgroup { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 2) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %buffer_2 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %buffer_3 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %lock_2 = amdaie.lock(%tile_0_2(0), 2) + %lock_3 = amdaie.lock(%tile_0_2(1), 0) + %buffer_4 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_5 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_6 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_7 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %lock_4 = amdaie.lock(%tile_0_1(0), 4) + %lock_5 = amdaie.lock(%tile_0_1(1), 0) + %buffer_8 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32> + %buffer_9 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32> + %buffer_10 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32> + %buffer_11 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32> + %lock_6 = amdaie.lock(%tile_0_2(0), 4) + %lock_7 = amdaie.lock(%tile_0_2(1), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> + %channel = amdaie.channel(%tile_0_1, 0) + %channel_1 = amdaie.channel(%tile_0_2, 0) + %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) + %3 = 
amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> + %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> + %channel_2 = amdaie.channel(%tile_0_1, 1) + %channel_3 = amdaie.channel(%tile_0_2, 1) + %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] []) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) + %6 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1]) + %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -363,19 +541,51 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// NOTE: Due to an AIE check that verifies whether AIE operations exist inside a -// core, it's hard to create a very small minimal test. 
-// -// CHECK: aie.device -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %{{.+}} = aie.tile(0, 1) -// CHECK-DAG: %{{.+}} = aie.tile(0, 0) -// CHECK: aie.core(%[[TILE_0_2]]) -// CHECK: aie.objectfifo.release -// CHECK: aiex.runtime_sequence @tile_and_core_and_release +// CHECK: aie.device(npu1_4col) { +// CHECK: memref.global "public" @shim_0 : memref<4096xi32> +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK: %[[TILE_0_0:.*]] = aie.tile(0, 0) +// CHECK: %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32> +// CHECK: %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_1_0:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_2"} +// CHECK: %[[LOCK_0_2_1:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"} +// CHECK: aie.flow(%[[TILE_0_0]], DMA : 0, %[[TILE_0_1]], DMA : 0) +// CHECK: aie.shim_dma_allocation @shim_0(MM2S, 0, 0) +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0) +// CHECK: %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) { +// CHECK: %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {len = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_0]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb2: +// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb3, ^bb4) +// CHECK: ^bb3: +// CHECK: aie.use_lock(%[[LOCK_0_1_0]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie, ]>, len = 1024 : i32, offset = 1024 : i32} +// CHECK: 
aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb3 +// CHECK: ^bb4: +// CHECK: aie.end +// CHECK: } +// CHECK: %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { +// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_1]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb2: +// CHECK: aie.end +// CHECK: } +// CHECK: aiex.runtime_sequence @single_connection_chain #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @tile_and_core_and_release() { + func.func @single_connection_chain() { amdaie.workgroup { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -383,24 +593,24 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_0 = memref.alloc() : memref<32x64xi32> - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%conn0]) { - 
amdaie.logicalobjectfifo.release(%conn0, Produce) {size = 1 : i32} - amdaie.end - } - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - memref.dealloc %alloc_0 : memref<32x64xi32> + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 1) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %lock_2 = amdaie.lock(%tile_0_2(0), 1) + %lock_3 = amdaie.lock(%tile_0_2(1), 0) + %0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %2 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel = amdaie.channel(%tile_0_0, 0) + %channel_1 = amdaie.channel(%tile_0_1, 0) + %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %channel_2 = amdaie.channel(%tile_0_1, 0) + %channel_3 = amdaie.channel(%tile_0_2, 0) + %4 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] []) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) + %5 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [0, 1024] [32, 32] [64, 1]) + %6 = amdaie.npu.circular_dma_cpy_nd %4([] [] [], [0, 1024] [32, 32] [64, 1]) amdaie.end } } @@ -410,6 +620,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +//===----------------------------------------------------------------------===// +// Controlcode tests +//===----------------------------------------------------------------------===// + #pipeline_layout = #hal.pipeline.layout]> 
#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -418,31 +632,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%placeholder, %obj1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 1) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : 
memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel = amdaie.channel(%tile_0_0, 0) + %channel_1 = amdaie.channel(%tile_0_1, 0) + %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] []) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a target BD ID op to lower to the AIE dialect}} - %npu_dma_2 = amdaie.npu.dma_cpy_nd %conn1(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma_2, S2MM) + %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] []) + %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a source BD ID op to lower to the AIE dialect}} + %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -453,8 +659,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK: aie.device -// CHECK: aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<32x64xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1]) +// CHECK: aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<4096xi32> +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1]) { +// CHECK-SAME: id = 0 : i64 #pipeline_layout = #hal.pipeline.layout]> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", 
"amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -463,27 +670,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %conn1 = amdaie.connection(%placeholder, %obj1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 1) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel = amdaie.channel(%tile_0_0, 0) + 
%channel_1 = amdaie.channel(%tile_0_1, 0) + %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] []) - %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] []) - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %npu_dma_2 = amdaie.npu.dma_cpy_nd %conn1(%obj0[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] []) + %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -495,73 +697,64 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Test to show mix of implicit/explicit source/target addressing in amdaie.npu.dma_cpy_nd. 
-// CHECK: aie.device -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1) -// CHECK-DAG: %[[TILE_0_0:.+]] = aie.tile(0, 0) -// CHECK: aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_2]], {%[[TILE_0_1]]} -// CHECK-NEXT: aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_0]]} -// CHECK-NEXT: aie.objectfifo @[[OBJ2:.+]](%[[TILE_0_0]], {%[[TILE_0_1]]} -// CHECK: aie.objectfifo.link [@[[OBJ0]]] -> [@[[OBJ1]]] -// CHECK: aiex.runtime_sequence @controlcode(%[[ARG0:.+]]: memref<32x64xi32> -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1] -// CHECK-SAME: issue_token = true -// CHECK-SAME: metadata = @[[OBJ1]] -// CHECK-NEXT: aiex.npu.dma_wait {symbol = @[[OBJ1]]} -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[ARG0]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1] -// CHECK-SAME: issue_token = true -// CHECK-SAME: metadata = @[[OBJ1]] -// CHECK-NEXT: aiex.npu.dma_wait {symbol = @[[OBJ1]]} -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1] -// CHECK-SAME: issue_token = true -// CHECK-SAME: metadata = @[[OBJ2]] -// CHECK-NEXT: aiex.npu.dma_wait {symbol = @[[OBJ2]]} -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0, 1] -// CHECK-SAME: issue_token = true -// CHECK-SAME: metadata = @[[OBJ2]] -// CHECK-NEXT: aiex.npu.dma_wait {symbol = @[[OBJ2]]} -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +// CHECK: aie.device +// CHECK: memref.global "public" @[[SHIM_1:.+]] : memref<2048xi32> +// CHECK: memref.global "public" @[[SHIM_0:.+]] : memref<4096xi32> +// CHECK: aiex.runtime_sequence @controlcode(%[[ARG0:.+]]: memref<4096xi32>, %[[ARG1:.+]]: memref<2048xi32>) +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : 
memref<4096xi32> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} +// CHECK: scf.forall +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} +// CHECK: } #pipeline_layout = #hal.pipeline.layout]> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @controlcode() { - %c2 = arith.constant 2 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index amdaie.workgroup { - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %0, 64 : memref<32x64xi32> - %tile = amdaie.tile(%c0, %c0) - %tile_0 = amdaie.tile(%c0, %c1) - %tile_1 = amdaie.tile(%c0, %c2) - %bd_id = amdaie.bd_id(%tile, 0) - %alloc = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_1} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %4 = amdaie.connection(%2, %3) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %5 = 
amdaie.connection(%1, %2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.connection(%2, %1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%4] -> [%5] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc : memref<32x32xi32, 1> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<2048xi32> + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 1) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %lock_2 = amdaie.lock(%tile_0_1(0), 1) + %lock_3 = amdaie.lock(%tile_0_1(1), 0) + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel = amdaie.channel(%tile_0_0, 0) + %channel_1 = amdaie.channel(%tile_0_1, 0) + %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_0, 1) + %channel_3 = amdaie.channel(%tile_0_1, 1) + %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { - %7 = amdaie.npu.circular_dma_cpy_nd %4([] [] 
[], [] [] []) - %8 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %6([] [] [], [] [] []) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.npu.dma_cpy_nd %5(%10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%11, S2MM) - %12 = amdaie.npu.dma_cpy_nd %5(%10[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12, S2MM) - %13 = amdaie.npu.dma_cpy_nd %6([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] []) + %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1]) + %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo> + %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12, MM2S) + %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%13, MM2S) - %14 = amdaie.npu.dma_cpy_nd %6([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14, MM2S) + scf.forall (%arg0, %arg1) in (2, 1) { + %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14, S2MM) + %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = 
!amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15, S2MM) + } amdaie.end } } @@ -571,71 +764,89 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK: aie.device(npu1_4col) { -// CHECK-DAG: %[[TILE_0_0:.*]] = aie.tile(0, 0) -// CHECK-DAG: %[[TILE_0_1:.*]] = aie.tile(0, 1) -// CHECK-DAG: %[[TILE_1_0:.*]] = aie.tile(1, 0) -// CHECK: aie.objectfifo @[[OBJ0:.*]](%[[TILE_0_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> -// CHECK: aie.objectfifo @[[OBJ1:.*]](%[[TILE_1_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> -// CHECK: aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]] -// CHECK-SAME: {%[[TILE_1_0]]}, 2 : i32) : !aie.objectfifo> -// CHECK: aiex.runtime_sequence @bf16_f32_lit_test -// CHECK-SAME: (%[[LHS:.*]]: memref<32x32xbf16>, %[[RHS:.*]]: memref<32x32xbf16>, %[[OUT:.*]]: memref<32x32xf32>) { -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[OUT]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1] -// CHECK-SAME: issue_token = true -// CHECK-SAME: metadata = @[[OBJ2]] -// CHECK-SAME: memref<32x32xf32> -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[RHS]][0, 0, 1, 2][1, 2, 32, 16][0, 16, 32, 1] -// CHECK-SAME: metadata = @[[OBJ1]] -// CHECK-SAME: memref<32x32xbf16> -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[LHS]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1] -// CHECK-SAME: metadata = @[[OBJ0]] -// CHECK-SAME: memref<32x32xbf16> +// CHECK: aie.device +// CHECK: memref.global "public" @[[SHIM_1:.+]] : memref<2048xf32> +// CHECK: memref.global "public" @[[SHIM_0:.+]] : memref<4096xbf16> +// CHECK: aiex.runtime_sequence @controlcode_bf16_f32(%[[ARG0:.+]]: memref<4096xbf16>, %[[ARG1:.+]]: memref<2048xf32>) +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 1, 2][1, 2, 32, 16][0, 16, 32, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : 
i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} +#pipeline_layout = #hal.pipeline.layout]> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -#pipeline_layout = #hal.pipeline.layout, , ]> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @bf16_f32_lit_test() { - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index + func.func @controlcode_bf16_f32() { + amdaie.workgroup { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xbf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<2048xf32> + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) + %buffer = amdaie.buffer(%tile_0_1) : memref<4096xbf16, 1 : i32> + %lock = amdaie.lock(%tile_0_1(0), 1) + %lock_1 = amdaie.lock(%tile_0_1(1), 0) + %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xf32, 1 : i32> + %lock_2 = amdaie.lock(%tile_0_1(0), 1) + %lock_3 = amdaie.lock(%tile_0_1(1), 0) + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 
1> + %channel = amdaie.channel(%tile_0_0, 0) + %channel_1 = amdaie.channel(%tile_0_1, 0) + %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> + %channel_2 = amdaie.channel(%tile_0_0, 1) + %channel_3 = amdaie.channel(%tile_0_1, 1) + %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) + amdaie.controlcode { + %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] []) + %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1]) + %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo> + %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo> + %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12, MM2S) + %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13, MM2S) + %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14, S2MM) + %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15, S2MM) + amdaie.end + } + } + return + } +} + +// ----- + +//===----------------------------------------------------------------------===// +// CoreOp tests +//===----------------------------------------------------------------------===// + +// 
CHECK: aie.device +// CHECK: %[[TILE_0_2:.+]] = aie.tile(0, 2) +// CHECK: aie.core(%[[TILE_0_2]]) { +// CHECK: aie.end +// CHECK: } +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @core() { amdaie.workgroup { - %alloc = memref.alloc() : memref<2x2x16x16xf32, 1 : i32> - %alloc_0 = memref.alloc() : memref<1x2x32x16xbf16, 1 : i32> - %tile = amdaie.tile(%c0, %c1) - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x2x16x16xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x16xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> - %2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x16xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16> - %tile_1 = amdaie.tile(%c0, %c0) - %tile_2 = amdaie.tile(%c1, %c0) - %bd_id = amdaie.bd_id(%tile_1, 2) - %bd_id_3 = amdaie.bd_id(%tile_1, 1) - %bd_id_4 = amdaie.bd_id(%tile_1, 0) - %4 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo> - memref.assume_alignment %3, 64 : memref<32x32xbf16> - %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16> - %6 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> - memref.assume_alignment %5, 64 : memref<32x32xbf16> - %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<32x32xf32> - %8 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> - %9 = amdaie.connection(%2, %4) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %10 = amdaie.connection(%1, %6) : 
(!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %11 = amdaie.connection(%8, %0) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %tile_0_2 = amdaie.tile(%c0, %c2) + %core_0_0 = amdaie.core(%tile_0_2, in : [], out : []) { + amdaie.end + } amdaie.controlcode { - %12 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [] [] []) - %13 = amdaie.npu.circular_dma_cpy_nd %10([] [] [], [] [] []) - %14 = amdaie.npu.circular_dma_cpy_nd %11([] [] [], [0, 0, 0, 0] [2, 16, 2, 16] [512, 16, 256, 1]) - %15 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> - %16 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> - %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo> - %18 = amdaie.npu.dma_cpy_nd %11(%17[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_4, [] [] []) : target_type = !amdaie.logicalobjectfifo> - %19 = amdaie.npu.dma_cpy_nd %10([] [] [], %16[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_3) : source_type = !amdaie.logicalobjectfifo> - %20 = amdaie.npu.dma_cpy_nd %9([] [] [], %15[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%18, S2MM) - amdaie.npu.dma_wait(%19, MM2S) - amdaie.npu.dma_wait(%20, MM2S) amdaie.end } } @@ -645,102 +856,285 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK: aie.device -// CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2) -// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) -// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1) -// CHECK-DAG: %[[TILE_0_0:.+]] = aie.tile(0, 0) -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK: aie.objectfifo 
@[[OBJ0:.+]](%[[TILE_0_0]], {%[[TILE_0_1]]} -// CHECK-NEXT: aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_2]], %[[TILE_1_2]]} -// CHECK-NEXT: aie.objectfifo.link -// CHECK-SAME: @[[OBJ0]] -// CHECK-SAME: @[[OBJ1]] -// CHECK: aie.core(%[[TILE_0_2]]) -// CHECK: %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1) -// CHECK: %[[ACCESS_0:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_0]] -// CHECK: %[[REINTERPRET_0:.+]] = memref.reinterpret_cast %[[ACCESS_0]] -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: linalg.fill -// CHECK-SAME: %[[REINTERPRET_0]] -// CHECK: } -// CHECK: aie.objectfifo.release -// CHECK-SAME: @[[OBJ1]] -// CHECK: aie.core(%[[TILE_1_2]]) -// CHECK: %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1) -// CHECK: %[[ACCESS_1:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_1]] -// CHECK: %[[REINTERPRET_1:.+]] = memref.reinterpret_cast %[[ACCESS_1]] -// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: linalg.fill -// CHECK-SAME: %[[REINTERPRET_1]] -// CHECK: } -// CHECK: aie.objectfifo.release -// CHECK-SAME: @[[OBJ1]] -// CHECK: aiex.runtime_sequence @large_example -// CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK-SAME: %[[ARG0]] -// CHECK-SAME: [0, 0, 0, 32] -// CHECK-SAME: [1, 1, 32, 32] -// CHECK-SAME: [0, 0, 64, 1] -// CHECK-SAME: issue_token = true -// CHECK-SAME: @[[OBJ0]] -// CHECK-NEXT: aiex.npu.dma_wait -// CHECK-SAME: @[[OBJ0]] +// CHECK: aie.device +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_0"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_2_0:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) { +// CHECK: 
aie.use_lock(%[[LOCK_0_2_0]], AcquireGreaterEqual, 1) +// CHECK: %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> +// CHECK: linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32>) +// CHECK: aie.use_lock(%[[LOCK_0_2]], Release, 1) +// CHECK: aie.end +// CHECK: } +// CHECK: aiex.runtime_sequence @core_acquire_release +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @core_acquire_release() { + amdaie.workgroup { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %tile_0_2 = amdaie.tile(%c0, %c2) + %buffer = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %lock = amdaie.lock(%tile_0_2(0), 1) + %lock_1 = amdaie.lock(%tile_0_2(1), 0) + %core_0_0 = amdaie.core(%tile_0_2, in : [], out : []) { + amdaie.use_lock(%lock_1, AcquireGreaterOrEqual(1)) + %3 = memref.reinterpret_cast %buffer to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<64x64xi32, 2 : i32>) + amdaie.use_lock(%lock, Release(1)) + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: aie.device +// CHECK: func.func private @ukernel_B(memref, index, memref, index) attributes {llvm.bareptr = true} +// CHECK: func.func private @ukernel_A(memref, index) attributes {llvm.bareptr = true} +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_0"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_2_0:.*]] = 
aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[BUFFER_0_2_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xf32, 2 : i32> +// CHECK: %[[LOCK_0_2_2:.*]] = aie.lock(%[[TILE_0_2]], 2) {init = 1 : i8, sym_name = "lock_2"} +// CHECK: %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 3) {init = 0 : i8, sym_name = "lock_3"} +// CHECK: %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) { +// CHECK: aie.use_lock(%[[LOCK_0_2_0]], AcquireGreaterEqual, 1) +// CHECK: %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], AcquireGreaterEqual, 1) +// CHECK: %[[REINTERPRET_CAST_4:.*]] = memref.reinterpret_cast %[[BUFFER_0_2_1]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xf32, 2 : i32> to memref<64x64xf32, 2 : i32> +// CHECK: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32> -> memref, index, index, index, index, index +// CHECK: %[[BASE_BUFFER_5:.*]], %[[OFFSET_6:.*]], %[[SIZES_7:.*]]:2, %[[STRIDES_8:.*]]:2 = memref.extract_strided_metadata %[[REINTERPRET_CAST_4]] : memref<64x64xf32, 2 : i32> -> memref, index, index, index, index, index +// CHECK: func.call @ukernel_A(%[[BASE_BUFFER]], %[[C0]]) : (memref, index) -> () +// CHECK: func.call @ukernel_B(%[[BASE_BUFFER]], %[[C0]], %[[BASE_BUFFER_5]], %[[C0]]) : (memref, index, memref, index) -> () +// CHECK: aie.use_lock(%[[LOCK_0_2]], Release, 1) +// CHECK: aie.use_lock(%[[LOCK_0_2_2]], Release, 1) +// CHECK: aie.end +// CHECK: } {link_with = "/path/to/ukernel.o"} +// CHECK: aiex.runtime_sequence @core_ukernel +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = 
#executable_target_amdaie_xclbin_fb} { + func.func private @ukernel_A(memref<i32, 2 : i32>, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true} + func.func private @ukernel_B(memref<i32, 2 : i32>, index, memref<f32, 2 : i32>, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true} + func.func @core_ukernel() { + amdaie.workgroup { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %tile_0_2 = amdaie.tile(%c0, %c2) + %buffer = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32> + %lock = amdaie.lock(%tile_0_2(0), 1) + %lock_1 = amdaie.lock(%tile_0_2(1), 0) + %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xf32, 2 : i32> + %lock_2 = amdaie.lock(%tile_0_2(2), 1) + %lock_3 = amdaie.lock(%tile_0_2(3), 0) + %core_0_0 = amdaie.core(%tile_0_2, in : [], out : []) { + amdaie.use_lock(%lock_1, AcquireGreaterOrEqual(1)) + %3 = memref.reinterpret_cast %buffer to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> + amdaie.use_lock(%lock_3, AcquireGreaterOrEqual(1)) + %4 = memref.reinterpret_cast %buffer_1 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xf32, 2 : i32> to memref<64x64xf32, 2 : i32> + %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %3 : memref<64x64xi32, 2 : i32> -> memref<i32, 2 : i32>, index, index, index, index, index + %base_buffer0, %offset0, %sizes0:2, %strides0:2 = memref.extract_strided_metadata %4 : memref<64x64xf32, 2 : i32> -> memref<f32, 2 : i32>, index, index, index, index, index + func.call @ukernel_A(%base_buffer, %c0) : (memref<i32, 2 : i32>, index) -> () + func.call @ukernel_B(%base_buffer, %c0, %base_buffer0, %c0) : (memref<i32, 2 : i32>, index, memref<f32, 2 : i32>, index) -> () + amdaie.use_lock(%lock, Release(1)) + amdaie.use_lock(%lock_2, Release(1)) + amdaie.end + } {link_with = "/path/to/ukernel.o"} + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +//===----------------------------------------------------------------------===// +// Larger tests
+//===----------------------------------------------------------------------===// + +// CHECK: aie.device(npu1_4col) { +// CHECK: memref.global "public" @[[SHIM_0:.+]] : memref<4096xi32> +// CHECK: %[[TILE_1_2:.*]] = aie.tile(1, 2) +// CHECK: %[[TILE_0_2:.*]] = aie.tile(0, 2) +// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK: %[[TILE_0_0:.*]] = aie.tile(0, 0) +// CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +// CHECK: %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32> +// CHECK: %[[BUFFER_0_1_0:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_1"} : memref<4096xi32, 1 : i32> +// CHECK: %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 2 : i8, sym_name = "lock_0"} +// CHECK: %[[LOCK_0_1_1:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"} +// CHECK: %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_2"} : memref<4096xi32, 2 : i32> +// CHECK: %[[BUFFER_0_2_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 2 : i8, sym_name = "lock_2"} +// CHECK: %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"} +// CHECK: %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "buff_4"} : memref<4096xi32, 2 : i32> +// CHECK: %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "buff_5"} : memref<4096xi32, 2 : i32> +// CHECK: %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]], 0) {init = 2 : i8, sym_name = "lock_4"} +// CHECK: %[[LOCK_1_2_5:.*]] = aie.lock(%[[TILE_1_2]], 1) {init = 0 : i8, sym_name = "lock_5"} +// CHECK: aie.flow(%[[TILE_0_0]], DMA : 0, %[[TILE_0_1]], DMA : 0) +// CHECK: aie.shim_dma_allocation @[[SHIM_0]](MM2S, 0, 0) +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_0_2]], DMA : 0) +// CHECK: aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_1_2]], DMA : 0) +// CHECK: %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) { +// CHECK: %[[VAL_0:.*]] 
= aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb4, ^bb6) +// CHECK: ^bb4: +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb5 +// CHECK: ^bb5: +// CHECK: aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32, offset = 1024 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_1]], Release, 1) +// CHECK: aie.next_bd ^bb4 +// CHECK: ^bb6: +// CHECK: aie.end +// CHECK: } +// CHECK: %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) { +// CHECK: %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_0_2_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: aie.end +// CHECK: } +// CHECK: %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
+// CHECK: %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: +// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_1_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_1_2_5]], Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: +// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%[[BUFFER_1_2_4]] : memref<4096xi32, 2 : i32>) {len = 0 : i32} +// CHECK: aie.use_lock(%[[LOCK_1_2_5]], Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: +// CHECK: aie.end +// CHECK: } +// CHECK: %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) { +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], AcquireGreaterEqual, 1) +// CHECK: %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> +// CHECK: linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32>) +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.use_lock(%[[LOCK_0_2_3]], AcquireGreaterEqual, 1) +// CHECK: %[[REINTERPRET_CAST_6:.*]] = memref.reinterpret_cast %[[BUFFER_0_2_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> +// CHECK: linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST_6]] : memref<64x64xi32, 2 : i32>) +// CHECK: aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1) +// CHECK: aie.end +// CHECK: } +// CHECK: %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) { +// CHECK: aie.use_lock(%[[LOCK_1_2_5]], AcquireGreaterEqual, 1) +// CHECK: %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_1_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> +// CHECK: linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32>) +// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 
1) +// CHECK: aie.use_lock(%[[LOCK_1_2_5]], AcquireGreaterEqual, 1) +// CHECK: %[[REINTERPRET_CAST_6:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_4]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> +// CHECK: linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST_6]] : memref<64x64xi32, 2 : i32>) +// CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) +// CHECK: aie.end +// CHECK: } +// CHECK: aiex.runtime_sequence @large_example(%[[ARG0:.*]]: memref<4096xi32>) { +// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> +// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} +// CHECK: } +// CHECK: } +#pipeline_layout = #hal.pipeline.layout]> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -#pipeline_layout = #hal.pipeline.layout]> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @large_example() { - %c8 = arith.constant 8 : index - %c2 = arith.constant 2 : index - %c1 = arith.constant 1 : index - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index amdaie.workgroup { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> %tile = amdaie.tile(%c0, %c0) %tile_0 = amdaie.tile(%c0, %c1) %tile_1 = amdaie.tile(%c0, %c2) %tile_2 = amdaie.tile(%c1, %c2) %bd_id = amdaie.bd_id(%tile, 0) - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %0, 64 : memref<32x64xi32> - %alloc = memref.alloc() : memref<32x32xi32, 1> - %alloc_3 = memref.alloc() : memref<4x8x4x8xi32, 2> - 
%1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1, %tile_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %4 = amdaie.connection(%2, %1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %5 = amdaie.connection(%3, %2) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%4] -> [%5] () + %buffer = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32> + %lock = amdaie.lock(%tile_0(0), 2) + %lock_4 = amdaie.lock(%tile_0(1), 0) + %buffer_5 = amdaie.buffer(%tile_1) : memref<4096xi32, 2 : i32> + %buffer_6 = amdaie.buffer(%tile_1) : memref<4096xi32, 2 : i32> + %lock_7 = amdaie.lock(%tile_1(0), 2) + %lock_8 = amdaie.lock(%tile_1(1), 0) + %buffer_9 = amdaie.buffer(%tile_2) : memref<4096xi32, 2 : i32> + %buffer_10 = amdaie.buffer(%tile_2) : memref<4096xi32, 2 : i32> + %lock_11 = amdaie.lock(%tile_2(0), 2) + %lock_12 = amdaie.lock(%tile_2(1), 0) + %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_3}, {%lock}, {%lock_4}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6, %buffer_9, %buffer_10}, {%lock_7, %lock_11}, {%lock_8, %lock_12}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> + %channel = amdaie.channel(%tile, 0) + %channel_13 = amdaie.channel(%tile_0, 0) + %4 = amdaie.connection(%2 {%channel_13}, %1 {%channel}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %channel_14 = amdaie.channel(%tile_0, 1) + %channel_15 = amdaie.channel(%tile_1, 0) + 
%channel_16 = amdaie.channel(%tile_2, 0) + %5 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) %6 = amdaie.core(%tile_1, in : [%5], out : []) { - %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> - %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2> - scf.for %arg0 = %c0 to %c8 step %c1 { - linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>) - } - amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32} + amdaie.use_lock(%lock_8, AcquireGreaterOrEqual(1)) + %reinterpret_cast = memref.reinterpret_cast %buffer_5 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>) + amdaie.use_lock(%lock_7, AcquireGreaterOrEqual(1)) + amdaie.use_lock(%lock_8, AcquireGreaterOrEqual(1)) + %reinterpret_cast_17 = memref.reinterpret_cast %buffer_6 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast_17 : memref<64x64xi32, 2 : i32>) + amdaie.use_lock(%lock_7, AcquireGreaterOrEqual(1)) amdaie.end } %7 = amdaie.core(%tile_2, in : [%5], out : []) { - %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> - %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2> - scf.for %arg0 = %c0 to %c8 step %c1 { - linalg.fill ins(%c0_i32 : i32) 
outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>) - } - amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32} + amdaie.use_lock(%lock_12, AcquireGreaterOrEqual(1)) + %reinterpret_cast = memref.reinterpret_cast %buffer_9 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>) + amdaie.use_lock(%lock_11, AcquireGreaterOrEqual(1)) + amdaie.use_lock(%lock_12, AcquireGreaterOrEqual(1)) + %reinterpret_cast_17 = memref.reinterpret_cast %buffer_10 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast_17 : memref<64x64xi32, 2 : i32>) + amdaie.use_lock(%lock_11, AcquireGreaterOrEqual(1)) amdaie.end } - memref.dealloc %alloc_3 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc : memref<32x32xi32, 1> amdaie.controlcode { - %8 = amdaie.npu.circular_dma_cpy_nd %4([] [] [], [] [] []) - %9 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [] [] []) - %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + %8 = amdaie.npu.circular_dma_cpy_nd %4([0, 0] [64, 64] [32, 1], [] [] []) + %9 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [64, 64] [32, 1]) + %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo> + %11 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%11, MM2S) amdaie.end } @@ -748,4 +1142,3 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } - diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt 
b/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt index 4a66d5863..4a909e30a 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt @@ -24,5 +24,6 @@ iree_cc_library( MLIRIR MLIRParser MLIRSupport + iree-amd-aie::aie_runtime::iree_aie_runtime_static PUBLIC )