From e623a6a36e7b92b46501f7f1c952b114da990a6a Mon Sep 17 00:00:00 2001
From: Vivian
Date: Tue, 10 Dec 2024 08:50:44 -0800
Subject: [PATCH] [DistributeL1Allocations] Include cases when the offset is a
 user of affineApplyOp (#975)

Without this fix, the L1 allocation in the newly added test would fail to
distribute.
---
 .../AMDAIEDistributeL1Allocations.cpp         | 23 +++++++++++++-----
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |  7 +++---
 .../test/distribute_l1_allocations.mlir       | 24 +++++++++++++++++++
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp
index 0af3e8b32..6deab3c53 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp
@@ -53,16 +53,27 @@ MemRefType getDistributedType(memref::AllocOp alloc,
   MemRefType type;
   for (Operation *allocUser : alloc->getUsers()) {
     if (auto subview = dyn_cast<memref::SubViewOp>(allocUser)) {
-      // Check that all offsets are either constants or thread ids. We assume
-      // that if a subview has an offset which is not a constant and not a
-      // thread id, it's not 'distributing'.
+      // Check that all offsets are either constants or depend on thread ids.
+      // We assume that if a subview has an offset which is not a constant and
+      // does not depend on a thread id, it's not 'distributing'.
       Operation::operand_range offsets = subview.getOffsets();
       int nIndVars{0};
       for (Value offset : offsets) {
         bool isConst = matchPattern(offset, m_Constant());
-        bool isIndVar = llvm::is_contained(indVars, offset);
-        nIndVars += isIndVar;
-        if (!isConst && !isIndVar) return {};
+        bool dependsOnIndVar = false;
+        if (!isConst &&
+            isa_and_present<affine::AffineApplyOp>(offset.getDefiningOp())) {
+          auto applyOp = cast<affine::AffineApplyOp>(offset.getDefiningOp());
+          for (auto operand : applyOp.getSymbolOperands()) {
+            dependsOnIndVar = llvm::is_contained(indVars, operand);
+            if (dependsOnIndVar) break;
+          }
+        } else {
+          dependsOnIndVar = llvm::is_contained(indVars, offset);
+        }
+
+        nIndVars += dependsOnIndVar;
+        if (!isConst && !dependsOnIndVar) return {};
       }
 
       // If there are no thread ids, this subview is not distributing.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 9d01a5bf5..171326d82 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -219,14 +219,13 @@ def AMDAIEDistributeL1Allocations :
     Pass<"iree-amdaie-distribute-l1-allocations", "ModuleOp"> {
   let summary = "Replace distributed L1 allocations with private allocations.";
   let description = [{
-    Each AIE core/tile is uniquely identified by gpu thread ids, usually
-    'y' (for AIE row) and 'x' (for AIE column).
-
     Some of the compilation pipelines in iree-amd-aie generate a single L1
     memory allocation describing the concatenation of all memory for all
     cores/tiles. Each thread then slices into a mutually exclusive rectangle
     of the allocation, along its thread dimensions, so 'privatizing' its
-    memory.
+    memory. Note that the thread dimensions are specified by the
+    `GPUThreadMappingAttr` in `scf.forall` ops, indicating that the distribution
+    occurs across one or more blocks of cores, with subsets designated as threads.
 
     This pass rewrites these allocations to be private to each core/tile. So
     it replaces a large allocation in L1 with a smaller allocation, smaller by
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir
index efbd2b931..130f92006 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir
@@ -32,6 +32,30 @@ func.func @distribute_l1_memory_test_0() {
 
 // -----
 
+// CHECK-LABEL: distribute_l1_memory_nested_loops
+// CHECK: %[[L2ALLOC:.+]] = memref.alloc() : memref<1x1x32x32xi32, 2>
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[L2ALLOC]] : memref<1x1x32x32xi32, 2>)
+// CHECK: memref.dealloc %[[L2ALLOC]] : memref<1x1x32x32xi32, 2>
+
+#map = affine_map<()[s0, s1] -> (s0 + s1)>
+func.func @distribute_l1_memory_nested_loops() {
+  %c0_i32 = arith.constant 0 : i32
+  %alloc = memref.alloc() : memref<4x4x32x32xi32, 2>
+  scf.forall (%arg0, %arg1) = (0, 0) to (4, 4) step (2, 2) {
+    scf.forall (%arg2, %arg3) in (2, 2) {
+      %0 = affine.apply #map()[%arg0, %arg2]
+      %1 = affine.apply #map()[%arg1, %arg3]
+      %subview = memref.subview %alloc[%0, %1, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<4x4x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[4096, 1024, 32, 1], offset: ?>, 2>
+      linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[4096, 1024, 32, 1], offset: ?>, 2>)
+    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+  memref.dealloc %alloc : memref<4x4x32x32xi32, 2>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @transfer_read_test()
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x8xbf16, 2>
 // CHECK: vector.transfer_read %[[ALLOC]]
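
For reference, the check introduced in AMDAIEDistributeL1Allocations.cpp above can be read as the following standalone helper. This is a minimal sketch, not code from the patch: the helper name `offsetDependsOnThreadId` and the `indVars` parameter (assumed to hold the scf.forall induction variables that are mapped to gpu threads) are illustrative only.

  // Sketch only: mirrors the patched check inside getDistributedType().
  #include "mlir/Dialect/Affine/IR/AffineOps.h"
  #include "mlir/IR/Value.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"

  // Returns true if `offset` is itself a thread-id induction variable, or is
  // produced by an affine.apply whose symbol operands contain one (e.g. the
  // `affine.apply #map()[%arg0, %arg2]` offsets in the new test).
  static bool offsetDependsOnThreadId(mlir::Value offset,
                                      llvm::ArrayRef<mlir::Value> indVars) {
    // Direct use of a thread-id induction variable as the subview offset.
    if (llvm::is_contained(indVars, offset)) return true;
    // Indirect use through an affine.apply on the thread ids.
    if (auto applyOp = offset.getDefiningOp<mlir::affine::AffineApplyOp>()) {
      return llvm::any_of(applyOp.getSymbolOperands(), [&](mlir::Value operand) {
        return llvm::is_contained(indVars, operand);
      });
    }
    return false;
  }

As in the patch, only the symbol operands of the affine.apply are inspected; the new test binds both surrounding induction variables as symbols in #map, so that is sufficient for this case.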