diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp index e1db37693..4b6a401d0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp @@ -73,9 +73,7 @@ class AMDAIECanonicalizeNpuDmaCpyNdPass srcStrides = getPrepended(srcStrides, zero); std::optional maybeSwapIndex = verifyAndGetZeroStrideIndex(srcSizes, srcStrides, dmaOp); - if (!maybeSwapIndex.has_value()) { - return WalkResult::interrupt(); - } + if (!maybeSwapIndex.has_value()) return WalkResult::interrupt(); uint32_t swapIndex = maybeSwapIndex.value(); bubble(srcOffsets, swapIndex); bubble(srcSizes, swapIndex); @@ -94,9 +92,7 @@ class AMDAIECanonicalizeNpuDmaCpyNdPass tgtStrides = getPrepended(tgtStrides, zero); std::optional maybeSwapIndex = verifyAndGetZeroStrideIndex(tgtSizes, tgtStrides, dmaOp); - if (!maybeSwapIndex.has_value()) { - return WalkResult::interrupt(); - } + if (!maybeSwapIndex.has_value()) return WalkResult::interrupt(); uint32_t swapIndex = maybeSwapIndex.value(); bubble(tgtOffsets, swapIndex); bubble(tgtSizes, swapIndex); @@ -114,9 +110,7 @@ class AMDAIECanonicalizeNpuDmaCpyNdPass return WalkResult::advance(); }); - if (walkResult.wasInterrupted()) { - return signalPassFailure(); - } + if (walkResult.wasInterrupted()) return signalPassFailure(); } private: @@ -133,9 +127,7 @@ class AMDAIECanonicalizeNpuDmaCpyNdPass static size_t getLowestIndexMaybeAboveOne(ArrayRef v) { for (size_t i = 0; i < v.size(); i++) { std::optional maybe = getConstantIntValue(v[i]); - if (!maybe.has_value() || maybe.value() > 1) { - return i; - } + if (!maybe.has_value() || maybe.value() > 1) return i; } return v.size(); } @@ -143,9 +135,7 @@ class AMDAIECanonicalizeNpuDmaCpyNdPass static size_t getHighestIndexMaybeZero(ArrayRef v) { 
for (size_t i = v.size(); i > 0; i--) { std::optional maybe = getConstantIntValue(v[i - 1]); - if (!maybe.has_value() || maybe.value() == 0) { - return i - 1; - } + if (!maybe.has_value() || maybe.value() == 0) return i - 1; } return 0; } @@ -161,7 +151,7 @@ class AMDAIECanonicalizeNpuDmaCpyNdPass size_t lastZeroStrideDim = getHighestIndexMaybeZero(strides); if (firstNonUnitDim < lastZeroStrideDim) { - // Limitation until AIE-4. + // HW limitation. dmaOp.emitOpError("might have stride=0 in dimension ") << lastZeroStrideDim << ", and size>1 in dimension " << firstNonUnitDim << ". As " << firstNonUnitDim << " < " diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 22d5e50a9..663a4a552 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -549,9 +549,10 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, uint32_t bdId = bdIdOp.getValue(); - if (!objFifo) + if (!objFifo) { return dmaOp.emitError() << "input isn't mapped to an `aie.objectifo` operation"; + } if (!offsets.empty() || !sizes.empty() || !strides.empty()) { // Not doing now as better to just eliminate use of aiex dialect diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp index 77728eb2e..f6116a144 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp @@ -24,7 +24,6 @@ namespace mlir::iree_compiler::AMDAIE { namespace { bool sinkInto(AMDAIE::CoreOp coreOp, PatternRewriter &rewriter) { - // Record if any ops are sunk into the core during this iteration. 
bool changed = false; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index ddcddbdd4..59a875330 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -215,6 +215,7 @@ std::unique_ptr createAMDAIEPadPass(AMDAIEPadOptions options = {}); std::unique_ptr createAMDAIEPeelForLoopPass( AMDAIEPeelForLoopOptions options = {}); +/// Create a pass to sink all dependencies into `amdaie.core` operations. std::unique_ptr createAMDAIESinkIntoCorePass(); /// Create pass to tile TilingInterface operations. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 40e1bae87..9f6560870 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -99,7 +99,7 @@ def AMDAIECanonicalizeNpuDmaCpyNd : let description = [{ Canonicalize the offsets/sizes/strides of npu.dma_cpy_nd operations on the L3 side of the data movement, to make them more representative of the DMA in hardware. - This pass ensures they offsets/sizes/strides are of size 4, and that no + This pass ensures the offsets/sizes/strides are of size `nbDimensions`, and that no dimensions with size>1 have stride=0 except for dimension zero (outer dimension). This is a HW constraint. 
}]; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir index 7b5aa94eb..f10ff8444 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir @@ -70,3 +70,39 @@ module { return } } + +// ----- + +module { + // CHECK-LABEL: dont_sink_amdaie_ops + // The 2 tiles, 2 logicalobjectfifos, and 1 dma_cpy_nd: + // CHECK-COUNT-5: amdaie + // CHECK: amdaie.core + // The logicalobjectfifo.access: + // CHECK-COUNT-1: amdaie + // CHECK: amdaie.end + func.func @dont_sink_amdaie_ops() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<32x1024xi32, 1> + %alloc_0 = memref.alloc() : memref<32x64xi32, 2> + scf.forall (%arg0, %arg1) in (1, 1) { + %tile = amdaie.tile(%c0, %c1) + %tile_1 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo<memref<32x64xi32, 2>> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0] [0, 0] [0, 0]) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>>) + %3 = amdaie.core(%tile_1, in : [%2], out : []) { + %c0_i32 = arith.constant 0 : i32 + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo<memref<32x64xi32, 2>> -> memref<32x64xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<32x64xi32, 2>) + amdaie.end + } + } {mapping = [#gpu.block<y>, #gpu.block<x>]} + memref.dealloc %alloc_0 : memref<32x64xi32, 2> + memref.dealloc %alloc : memref<32x1024xi32, 1> + return + } +} +