From e623a6a36e7b92b46501f7f1c952b114da990a6a Mon Sep 17 00:00:00 2001
From: Vivian
Date: Tue, 10 Dec 2024 08:50:44 -0800
Subject: [PATCH] [DistributeL1Allocations] Include cases when the offset is a
 user of affineApplyOp (#975)

Without this fix, the L1 allocation in the newly added test would fail to
distribute.
---
 .../AMDAIEDistributeL1Allocations.cpp         | 23 +++++++++++++-----
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |  7 +++---
 .../test/distribute_l1_allocations.mlir       | 24 +++++++++++++++++++
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp
index 0af3e8b32..6deab3c53 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp
@@ -53,16 +53,27 @@ MemRefType getDistributedType(memref::AllocOp alloc,
   MemRefType type;
   for (Operation *allocUser : alloc->getUsers()) {
     if (auto subview = dyn_cast<memref::SubViewOp>(allocUser)) {
-      // Check that all offsets are either constants or thread ids. We assume
-      // that if a subview has an offset which is not a constant and not a
-      // thread id, it's not 'distributing'.
+      // Check that all offsets are either constants or depend on thread ids.
+      // We assume that if a subview has an offset which is not a constant and
+      // does not depend on a thread id, it's not 'distributing'.
       Operation::operand_range offsets = subview.getOffsets();
       int nIndVars{0};
       for (Value offset : offsets) {
         bool isConst = matchPattern(offset, m_Constant());
-        bool isIndVar = llvm::is_contained(indVars, offset);
-        nIndVars += isIndVar;
-        if (!isConst && !isIndVar) return {};
+        bool dependsOnIndVar = false;
+        if (!isConst &&
+            isa_and_present<affine::AffineApplyOp>(offset.getDefiningOp())) {
+          auto applyOp = cast<affine::AffineApplyOp>(offset.getDefiningOp());
+          for (auto operand : applyOp.getSymbolOperands()) {
+            dependsOnIndVar = llvm::is_contained(indVars, operand);
+            if (dependsOnIndVar) break;
+          }
+        } else {
+          dependsOnIndVar = llvm::is_contained(indVars, offset);
+        }
+
+        nIndVars += dependsOnIndVar;
+        if (!isConst && !dependsOnIndVar) return {};
       }
 
       // If there are no thread ids, this subview is not distributing.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 9d01a5bf5..171326d82 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -219,14 +219,13 @@ def AMDAIEDistributeL1Allocations :
     Pass<"iree-amdaie-distribute-l1-allocations", "ModuleOp"> {
   let summary = "Replace distributed L1 allocations with private allocations.";
   let description = [{
-    Each AIE core/tile is uniquely identified by gpu thread ids, usually
-    'y' (for AIE row) and 'x' (for AIE column).
-
     Some of the compilation pipelines in iree-amd-aie generate a single L1
     memory allocation describing the concatenation of all memory for all
     cores/tiles. Each thread then slices into a mutually exclusive rectangle
     of the allocation, along its thread dimensions, so 'privatizing' its
-    memory.
+    memory. Note that the thread dimensions are specified by the
+    `GPUThreadMappingAttr` in `scf.forall` ops, indicating that the distribution
+    occurs across one or more blocks of cores, with subsets designated as threads.
 
     This pass rewrites these allocations to be private to each core/tile. So
     it replaces a large allocation in L1 with a smaller allocation, smaller by
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir
index efbd2b931..130f92006 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir
@@ -32,6 +32,30 @@ func.func @distribute_l1_memory_test_0() {
 
 // -----
 
+// CHECK-LABEL: distribute_l1_memory_nested_loops
+// CHECK: %[[L2ALLOC:.+]] = memref.alloc() : memref<1x1x32x32xi32, 2>
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[L2ALLOC]] : memref<1x1x32x32xi32, 2>)
+// CHECK: memref.dealloc %[[L2ALLOC]] : memref<1x1x32x32xi32, 2>
+
+#map = affine_map<()[s0, s1] -> (s0 + s1)>
+func.func @distribute_l1_memory_nested_loops() {
+  %c0_i32 = arith.constant 0 : i32
+  %alloc = memref.alloc() : memref<4x4x32x32xi32, 2>
+  scf.forall (%arg0, %arg1) = (0, 0) to (4, 4) step (2, 2) {
+    scf.forall (%arg2, %arg3) in (2, 2) {
+      %0 = affine.apply #map()[%arg0, %arg2]
+      %1 = affine.apply #map()[%arg1, %arg3]
+      %subview = memref.subview %alloc[%0, %1, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<4x4x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[4096, 1024, 32, 1], offset: ?>, 2>
+      linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[4096, 1024, 32, 1], offset: ?>, 2>)
+    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+  memref.dealloc %alloc : memref<4x4x32x32xi32, 2>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @transfer_read_test()
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x8xbf16, 2>
 // CHECK: vector.transfer_read %[[ALLOC]]
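
For reference, the check introduced in AMDAIEDistributeL1Allocations.cpp above can be read as the following standalone helper. This is a minimal sketch, not code from the patch: the helper name `offsetDependsOnThreadId` and the `indVars` parameter (assumed to hold the scf.forall induction variables that are mapped to gpu threads) are illustrative only.

  // Sketch only: mirrors the patched check inside getDistributedType().
  #include "mlir/Dialect/Affine/IR/AffineOps.h"
  #include "mlir/IR/Value.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"

  // Returns true if `offset` is itself a thread-id induction variable, or is
  // produced by an affine.apply whose symbol operands contain one (e.g. the
  // `affine.apply #map()[%arg0, %arg2]` offsets in the new test).
  static bool offsetDependsOnThreadId(mlir::Value offset,
                                      llvm::ArrayRef<mlir::Value> indVars) {
    // Direct use of a thread-id induction variable as the subview offset.
    if (llvm::is_contained(indVars, offset)) return true;
    // Indirect use through an affine.apply on the thread ids.
    if (auto applyOp = offset.getDefiningOp<mlir::affine::AffineApplyOp>()) {
      return llvm::any_of(applyOp.getSymbolOperands(), [&](mlir::Value operand) {
        return llvm::is_contained(indVars, operand);
      });
    }
    return false;
  }

As in the patch, only the symbol operands of the affine.apply are inspected; the new test binds both surrounding induction variables as symbols in #map, so that is sufficient for this case.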