From a2b5610937a6be51b025e7ffd4d79cc8e7438100 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <mravisha@amd.com>
Date: Wed, 4 Dec 2024 23:58:09 -0600
Subject: [PATCH] [Codegen][LLVMGPU] Avoid long compilation times of warp
 reduction pipeline.

The warp reduction pipeline's tile size logic isn't very robust for
dynamic dimensions. For now, use a fallback whenever dynamic
dimensions exist to keep compilation times reasonable.

Signed-off-by: MaheshRavishankar <mravisha@amd.com>
---
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp |  5 +--
 .../Codegen/LLVMGPU/test/config_matvec.mlir   | 31 +++++++++++++++++++
 .../LLVMGPU/test/gpu_set_num_workgroups.mlir  |  4 +--
 3 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 23e5cbb13e27..debbf0fefcd6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -1577,6 +1577,7 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
       return failure();
     }
   }
+  int numDynamicDims = llvm::count_if(bounds, ShapedType::isDynamic);
 
   // Distribution of multi-dim masked writes currently aren't fully supported.
   if (numDynamicReductionDims > 1) {
@@ -1617,9 +1618,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
   size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1;
   SmallVector<int64_t> workgroupTileSizes(numLoops, 1);
 
-  // Without any bounds on dynamic reduction dims, we need specialization to
+  // Without any bounds on dynamic dims, we need specialization to
   // get peak performance. For now, just use the warp size.
-  if (numDynamicReductionDims) {
+  if (numDynamicDims) {
     SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0);
     int64_t preferredSubgroupSize = target.getPreferredSubgroupSize();
     reductionTileSizes[reductionDims[0]] = preferredSubgroupSize;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
index 5262c3460fad..1e5dbf63f2f9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
@@ -273,3 +273,34 @@ func.func @not_vmt() {
 //  CHECK-SAME:     translation_info = #[[$TRANSLATION]]
 //       CHECK:   linalg.generic
 //  CHECK-SAME:       lowering_config = #[[$CONFIG]]
+
+// -----
+
+func.func @dynamic_parallel_dims(%dynsize : index, %input : tensor<4x?x4096xf16>) -> tensor<4x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %0 = tensor.empty(%dynsize) : tensor<4x?xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x?xf32>) -> tensor<4x?xf32>
+  %2 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel", "reduction"]}
+      ins(%input : tensor<4x?x4096xf16>) outs(%1 : tensor<4x?xf32>) {
+    ^bb0(%in: f16, %out: f32):
+      %3 = arith.extf %in : f16 to f32
+      %4 = arith.addf %3, %out : f32
+      linalg.yield %4 : f32
+    } -> tensor<4x?xf32>
+  return %2 : tensor<4x?xf32>
+}
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 64]{{\]}}
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1]>
+//      CHECK: func @dynamic_parallel_dims
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK:   linalg.generic
+// CHECK-SAME:       lowering_config = #[[CONFIG]]
+
+//  CDNA3-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}
+//  CDNA3-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1]>
+//      CDNA3: func @dynamic_parallel_dims
+// CDNA3-SAME:     translation_info = #[[TRANSLATION]]
+//      CDNA3:   linalg.generic
+// CDNA3-SAME:       lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 6145ebd9688f..feb0e2766303 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -743,8 +743,8 @@ func.func @i4_dequant_matvec() {
   return
 }
 
-//   CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 256]{{\]}}>
-//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 32>
+//   CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}>
+//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1]>
 // CHECK-LABEL: func.func @i4_dequant_matvec()
 //  CHECK-SAME:   translation_info = #[[$TRANSLATION]]
 //       CHECK:   linalg.generic