From 71e17ed31f0182801894d351a6ad5f112be8efc9 Mon Sep 17 00:00:00 2001 From: Zhewen Yu Date: Wed, 11 Dec 2024 11:54:11 +0000 Subject: [PATCH] Move AMDAIEAssignChannelsPass before AMDAIEAssignNpuDmaBdIdsPass (#980) This PR is to resolve the following comment: https://github.com/nod-ai/iree-amd-aie/blob/e623a6a36e7b92b46501f7f1c952b114da990a6a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp#L269-L274 - passes are now reordered - add a utility function to retrieve `ChannelOp` from given `NpuDmaCpyNdOp` - unit test `assign-npu-dma-bd-ids.mlir` is refactored --- .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp | 22 ++ .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td | 4 + .../Transforms/AMDAIEAssignNpuDmaBdIds.cpp | 43 +-- .../iree-amd-aie/Transforms/Passes.cpp | 8 +- .../test/assign_npu_dma_bd_ids.mlir | 313 ++++++++++-------- 5 files changed, 225 insertions(+), 165 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index d98607dfa..72ff124af 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -1066,6 +1066,28 @@ bool NpuDmaCpyNdOp::hasDmaWaitOpUser() { [](auto userOp) { return isa(userOp); }); } +FailureOr NpuDmaCpyNdOp::getSourceChannelOp() { + AMDAIE::ConnectionOp connectionOp = getConnectionOp(); + if (!connectionOp) + return emitOpError() << "should operate on an `amdaie.connection` op"; + if (connectionOp.getSourceChannels().size() != 1) + return emitOpError() << "expected a single source channel"; + auto sourceChannelOp = dyn_cast( + connectionOp.getSourceChannels()[0].getDefiningOp()); + return sourceChannelOp; +} + +FailureOr NpuDmaCpyNdOp::getTargetChannelOp() { + AMDAIE::ConnectionOp connectionOp = getConnectionOp(); + if (!connectionOp) + return emitOpError() << "should operate on an `amdaie.connection` op"; + if 
(connectionOp.getTargetChannels().size() != 1) + return emitOpError() << "expected a single target channel"; + auto targetChannelOp = dyn_cast( + connectionOp.getTargetChannels()[0].getDefiningOp()); + return targetChannelOp; +} + namespace { struct NpuDmaCpyNdOpReplacementBuilder { static void replace(NpuDmaCpyNdOp dmaOp, PatternRewriter &rewriter, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index fa45c468e..36985c8b9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -558,6 +558,10 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ if (!bdIdValue) return nullptr; return dyn_cast_if_present(bdIdValue.getDefiningOp()); } + + FailureOr getSourceChannelOp(); + + FailureOr getTargetChannelOp(); // A utility to create a new doubly strided operation from this one with a // new set of source and target offsets, sizes and strides. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp index 872eceac2..4c751d6d6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp @@ -146,13 +146,27 @@ template FailureOr getBdIdOp( IRRewriter &rewriter, AMDAIE::NpuDmaCpyNdOp &npuDmaOp, DenseMap &shimTileToGeneratorMap, - DenseMap> &bdIdOpToBdIdsMap, - uint32_t channel) { - FailureOr tileOp = + DenseMap> &bdIdOpToBdIdsMap) { + // Get the TileOp. + FailureOr maybeTileOp = getGeneratorTileOp(npuDmaOp, shimTileToGeneratorMap); - if (failed(tileOp)) return failure(); + if (failed(maybeTileOp)) return failure(); + AMDAIE::TileOp tileOp = maybeTileOp.value(); + + // Get the channel. 
+ FailureOr maybeChannelOp; + if constexpr (OperateOn == CopyOpOperateOn::Source) { + maybeChannelOp = npuDmaOp.getSourceChannelOp(); + } else if constexpr (OperateOn == CopyOpOperateOn::Target) { + maybeChannelOp = npuDmaOp.getTargetChannelOp(); + } else { + return npuDmaOp.emitOpError() + << "Function can only operate on Source or Target"; + } + if (failed(maybeChannelOp)) return failure(); + uint32_t channel = maybeChannelOp.value().getValue(); - ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp->getResult()]; + ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp.getResult()]; rewriter.setInsertionPoint(npuDmaOp); if (scf::ForOp loop = npuDmaOp->getParentOfType(); loop && getNumberIterations(loop)) { @@ -165,7 +179,7 @@ FailureOr getBdIdOp( // Get the number of BD IDs will be assigned to current DMA op. uint32_t numRequired = 0; - getNumRequiredBdIds(loop, npuDmaOp, *tileOp, shimTileToGeneratorMap, + getNumRequiredBdIds(loop, npuDmaOp, tileOp, shimTileToGeneratorMap, numRequired); uint32_t numAvailable = generator.getNumAvailableBdIds(channel); uint32_t size = std::max(numAvailable / numRequired, 1u); @@ -193,7 +207,7 @@ FailureOr getBdIdOp( iv, }); AMDAIE::BdIdOp bdIdOp = rewriter.create( - rewriter.getUnknownLoc(), *tileOp, affineApply.getResult()); + rewriter.getUnknownLoc(), tileOp, affineApply.getResult()); bdIdOpToBdIdsMap[bdIdOp] = bdIds; return bdIdOp; } @@ -206,7 +220,7 @@ FailureOr getBdIdOp( auto constant = rewriter.create( rewriter.getUnknownLoc(), rewriter.getIndexAttr(bdId.value())); AMDAIE::BdIdOp bdIdOp = rewriter.create( - rewriter.getUnknownLoc(), *tileOp, constant.getResult()); + rewriter.getUnknownLoc(), tileOp, constant.getResult()); return bdIdOp; }; @@ -266,13 +280,6 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { } }); - // TODO(jornt): Temporarily use channel 0 for all DMAs. 
This should - // return correct results for Shim channels, however, for generality - // towards other DMAs and future hardware generations, channel - // assignment should happen before BD assignemnt. This requires more - // refactoring. - const uint32_t channel = 0; - DenseMap> bdIdOpToBdIdsMap; // Walk `amdaie.npu_dma_cpy_nd` and `amdaie.dma_wait` operations and assign // and release BD IDs when encountering the respective operations using the @@ -282,8 +289,7 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { if (auto npuDmaOp = dyn_cast(op)) { if (npuDmaOp.getSource()) { FailureOr bdIdOp = getBdIdOp( - rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap, - channel); + rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap); if (failed(bdIdOp)) return WalkResult::interrupt(); rewriter.setInsertionPoint(npuDmaOp); npuDmaOp = rewriter.replaceOpWithNewOp( @@ -296,8 +302,7 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { } if (npuDmaOp.getTarget()) { FailureOr bdIdOp = getBdIdOp( - rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap, - channel); + rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap); if (failed(bdIdOp)) return WalkResult::interrupt(); rewriter.setInsertionPoint(npuDmaOp); (void)rewriter.replaceOpWithNewOp( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 9ece915fe..4d91808c4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -635,6 +635,10 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCSEPass()); + passManager.addPass(createAMDAIEAssignChannelsPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); + 
passManager.addPass(createAMDAIEAssignNpuDmaBdIdsPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); @@ -650,10 +654,6 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createAMDAIEConvertCoreForallToForPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createAMDAIEAssignChannelsPass()); - passManager.addPass(createCSEPass()); - passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createAMDAIEObjFifoBufferizationPass()); passManager.addPass(createAMDAIETemporaryAllocBufferizationPass()); passManager.addPass(createAMDAIEConnectionToFlowPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir index ed3464aaa..785a20ff7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir @@ -11,15 +11,15 @@ module { // ----- +// Expect constant BD ID 0 is assigned to the DMA copy operation. 
+ // CHECK-LABEL: @single_dma_cpy_nd_on_source // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: amdaie.controlcode // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_source_token) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -29,13 +29,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = 
amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %connection = amdaie.connection(%from_memref_0 {%channel_1}, %placeholder {%channel_0}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%1 : !amdaie.async_source_token) + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%0 : !amdaie.async_source_token) amdaie.end } } @@ -45,15 +47,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect constant BD ID 0 is assigned to the DMA copy operation. 
+ // CHECK-LABEL: @single_dma_cpy_nd_on_target // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: amdaie.controlcode // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA]](%[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_target %{{.+}}(%{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_target_token) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -63,13 +65,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = 
amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %connection = amdaie.connection(%placeholder {%channel_0}, %from_memref_0 {%channel_1}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd async_target %0(%from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%1 : !amdaie.async_target_token) + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_target %connection(%from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%0 : !amdaie.async_target_token) amdaie.end } } @@ -79,27 +83,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect all DMA copy operations are assigned with constant BD ID 0, because they are all on different shim tiles. 
+ // CHECK-LABEL: @multiple_dma_cpy_on_diff_tiles -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) // CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) // CHECK: %[[BD_ID_2:.+]] = 
amdaie.bd_id(%[[TILE_2_0]], %[[C0]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) @@ -114,20 +114,26 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_1_0 = amdaie.tile(%c1, %c0) %tile_2_0 = amdaie.tile(%c2, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> - %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_0 = amdaie.channel(%tile_0_0, 0, 
port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_2 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_4 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM) + %channel_5 = amdaie.channel(%tile_0_1, 2, port_type = DMA, direction = S2MM) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> + %placeholder_0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %placeholder_1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %placeholder_2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> + %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder_0 {%channel_0}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %connection_1 = amdaie.connection(%from_memref_0 {%channel_4}, %placeholder_1 {%channel_1}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder_2 {%channel_2}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> - %1 = amdaie.npu.dma_cpy_nd async_source %dma1([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> - %2 = amdaie.npu.dma_cpy_nd async_source %dma2([] [] [], %from_memref_2[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_2 = 
amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %1 = amdaie.npu.dma_cpy_nd async_source %connection_1([] [] [], %from_memref_2[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%0 : !amdaie.async_source_token) amdaie.npu.dma_wait(%1 : !amdaie.async_source_token) amdaie.npu.dma_wait(%2 : !amdaie.async_source_token) @@ -140,23 +146,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect BD IDs: 0, 1, 2 are assigned to the DMA copy operations, as incremental assignment is used. 
+ // CHECK-LABEL: @multiple_dma_cpy_with_wait_after_each // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK: amdaie.workgroup -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: amdaie.controlcode // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) // CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = 
"npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -166,17 +172,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %connection = amdaie.connection(%from_memref_0 {%channel_1}, %placeholder {%channel_0}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%0 : !amdaie.async_source_token) + %1 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = 
!amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1 : !amdaie.async_source_token) - %2 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%2 : !amdaie.async_source_token) - %3 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%3 : !amdaie.async_source_token) amdaie.end } } @@ -186,21 +194,21 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect BD IDs: 0, 1, 2 are assigned to the DMA copy operations, as incremental assignment is used and IDs are only released after waits. + // CHECK-LABEL: @multiple_dma_cpy_with_wait_after_all // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK: amdaie.workgroup // CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK:
%[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) @@ -212,17 +220,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %connection = amdaie.connection(%from_memref_0 {%channel_1}, %placeholder {%channel_0}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd async_source %0([] 
[] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> - %2 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> - %3 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %1 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%0 : !amdaie.async_source_token) amdaie.npu.dma_wait(%1 : !amdaie.async_source_token) amdaie.npu.dma_wait(%2 : !amdaie.async_source_token) - amdaie.npu.dma_wait(%3 : !amdaie.async_source_token) amdaie.end } } @@ -231,40 +241,37 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} } // ----- + +// Expect two DMA copy operations at the innermost loop have BD IDs as expressions. 
#map0: 1~15, #map1: 0~15 + // CHECK: #map = affine_map<(d0) -> (d0 mod 15 + 1)> // CHECK: #map1 = affine_map<(d0) -> (d0 mod 16)> // CHECK-LABEL: @nested_loops_multi_tiles -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[C2:.+]] = arith.constant 2 : index -// CHECK: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index // CHECK: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) // CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) // CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) // CHECK: %[[BD_ID_1_0:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) -// CHECK: 
%[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) // CHECK: scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C6]] step %[[C1]] // CHECK: %[[VAR_0:.+]] = affine.apply #map(%[[LOOP_VAR_0]]) // CHECK: %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[VAR_0]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) // CHECK: %[[BD_ID_0_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_0]]) -// CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0_1]]) +// CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_0_1]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_3]] : !amdaie.async_source_token) // CHECK: %[[VAR_1:.+]] = affine.apply #map1(%[[LOOP_VAR_0]]) // CHECK: %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], %[[VAR_1]]) -// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2_0]]) +// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_2_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_4]] : !amdaie.async_source_token) // CHECK: } // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) @@ -282,26 +289,32 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_1_0 = amdaie.tile(%c1, 
%c0) %tile_2_0 = amdaie.tile(%c2, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> - %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_2 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_4 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM) + %channel_5 = amdaie.channel(%tile_0_1, 2, port_type = DMA, direction = S2MM) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> + %placeholder_0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + 
%placeholder_1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %placeholder_2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> + %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder_0 {%channel_0}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %connection_1 = amdaie.connection(%from_memref_0 {%channel_4}, %placeholder_1 {%channel_1}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder_2 {%channel_2}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> scf.forall (%arg4, %arg5) in (2, 2) { - %1 = amdaie.npu.dma_cpy_nd async_source %dma1([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %1 = amdaie.npu.dma_cpy_nd async_source %connection_1([] [] [], %from_memref_2[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> scf.for %arg6 = %c0 to %c6 step %c1 { - %2 = amdaie.npu.dma_cpy_nd async_source %dma1([] [] [], %from_memref_1[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo> - %3 = 
amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd async_source %connection_1([] [] [], %from_memref_2[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo> + %3 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%2 : !amdaie.async_source_token) amdaie.npu.dma_wait(%3 : !amdaie.async_source_token) - %4 = amdaie.npu.dma_cpy_nd async_source %dma2([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo> + %4 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%4 : !amdaie.async_source_token) } amdaie.npu.dma_wait(%1 : !amdaie.async_source_token) @@ -316,33 +329,32 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect all three DMA copy operations have BD IDs as expressions. #map0: 0~15, #map1: 0~7, #map2: 8~15 +// BD IDs used by #map0 are released before the innermost loop, so that they can be reused by #map1 and #map2. 
+ // CHECK: #map = affine_map<(d0) -> (d0 mod 16)> // CHECK: #map1 = affine_map<(d0) -> (d0 mod 8)> // CHECK: #map2 = affine_map<(d0) -> (d0 mod 8 + 8)> // CHECK-LABEL: @nested_loops_wait_before_innerloop -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[C2:.+]] = arith.constant 2 : index -// CHECK: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK: amdaie.workgroup -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: amdaie.controlcode // CHECK: scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] // CHECK: %[[VAR_0:.+]] = affine.apply #map(%[[LOOP_VAR_0]]) // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_0]]) -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][] [] [] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) // CHECK: scf.for %[[LOOP_VAR_1:.+]] = %[[C0]] to %[[C2]] step %[[C1]] // CHECK: %[[VAR_1:.+]] = affine.apply #map1(%[[LOOP_VAR_1]]) // CHECK: %[[BD_ID_1:.+]] 
= amdaie.bd_id(%[[TILE_0_0]], %[[VAR_1]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA_0]](%[[FROM_MEMREF_1]][] [] [] bd_id = %[[BD_ID_1]], [] [] []) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %{{.+}}(%{{.+}}[] [] [] bd_id = %[[BD_ID_1]], [] [] []) // CHECK: %[[VAR_2:.+]] = affine.apply #map2(%[[LOOP_VAR_1]]) // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_2]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_target_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) // CHECK: } @@ -356,19 +368,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %c4 = arith.constant 4 : index amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) - %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_0} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %channel_2 = 
amdaie.channel(%tile_0_0, 1, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_4 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_5 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder {%channel_0}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %connection_1 = amdaie.connection(%placeholder {%channel_1}, %from_memref_0 {%channel_4}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder {%channel_2}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> scf.for %arg4 = %c0 to %c4 step %c1 { - %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[] [] []) : source_type = !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[] [] []) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%0 : !amdaie.async_source_token) scf.for %arg5 = %c0 to %c2 step %c1 { - %1 = amdaie.npu.dma_cpy_nd async_target %dma0(%from_memref_1[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo> - %2 = 
amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo> + %1 = amdaie.npu.dma_cpy_nd async_target %connection_1(%from_memref_2[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1 : !amdaie.async_target_token) amdaie.npu.dma_wait(%2 : !amdaie.async_source_token) } @@ -382,32 +403,31 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect all three DMA copy operations have BD IDs as expressions. #map0: 0~1, #map1: 2~8, #map2: 9~15. +// BD IDs used by #map0 are released after the innermost loop, so that they cannot be reused by #map1 and #map2. + // CHECK: #map = affine_map<(d0) -> (d0 mod 2)> // CHECK: #map1 = affine_map<(d0) -> (d0 mod 7 + 2)> // CHECK: #map2 = affine_map<(d0) -> (d0 mod 7 + 9)> // CHECK-LABEL: @nested_loops_wait_after_innerloop -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[C2:.+]] = arith.constant 2 : index -// CHECK: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK: amdaie.workgroup -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK: 
%[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: amdaie.controlcode // CHECK: scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C4]] step %[[C1]] // CHECK: %[[VAR_0:.+]] = affine.apply #map(%[[LOOP_VAR_0]]) // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_0]]) -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][] [] [] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_0]]) // CHECK: scf.for %[[LOOP_VAR_1:.+]] = %[[C0]] to %[[C2]] step %[[C1]] // CHECK: %[[VAR_1:.+]] = affine.apply #map1(%[[LOOP_VAR_1]]) // CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_1]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA_0]](%[[FROM_MEMREF_1]][] [] [] bd_id = %[[BD_ID_1]], [] [] []) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %{{.+}}(%{{.+}}[] [] [] bd_id = %[[BD_ID_1]], [] [] []) // CHECK: %[[VAR_2:.+]] = affine.apply #map2(%[[LOOP_VAR_1]]) // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_2]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_target_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) // CHECK: } @@ -422,18 +442,27 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %c4 = arith.constant 4 : index amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) - %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> 
!amdaie.logicalobjectfifo> - %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> - %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_0} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_4 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_5 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo, 2> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder {%channel_0}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %connection_1 = amdaie.connection(%placeholder {%channel_1}, %from_memref_0 {%channel_4}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder {%channel_2}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_2 = 
amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> scf.for %arg4 = %c0 to %c4 step %c1 { - %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[] [] []) : source_type = !amdaie.logicalobjectfifo> + %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[] [] []) : source_type = !amdaie.logicalobjectfifo> scf.for %arg5 = %c0 to %c2 step %c1 { - %1 = amdaie.npu.dma_cpy_nd async_target %dma0(%from_memref_1[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo> - %2 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo> + %1 = amdaie.npu.dma_cpy_nd async_target %connection_1(%from_memref_2[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1 : !amdaie.async_target_token) amdaie.npu.dma_wait(%2 : !amdaie.async_source_token) }