Skip to content

Commit

Permalink
[Codegen][LLVMGPU] Avoid long compilation times of warp reduction pip…
Browse files Browse the repository at this point in the history
…eline.

The warp reduction pipeline's tile-size logic isn't very robust for
dynamic dimensions. For now, use a fallback in cases where dynamic
dimensions exist, to allow for reasonable compilation times.

Signed-off-by: MaheshRavishankar <[email protected]>
  • Loading branch information
MaheshRavishankar committed Dec 5, 2024
1 parent c3db710 commit a2b5610
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 4 deletions.
5 changes: 3 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1577,6 +1577,7 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
return failure();
}
}
int numDynamicDims = llvm::count_if(bounds, ShapedType::isDynamic);

// Distribution of multi-dim masked writes currently aren't fully supported.
if (numDynamicReductionDims > 1) {
Expand Down Expand Up @@ -1617,9 +1618,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1;
SmallVector<int64_t> workgroupTileSizes(numLoops, 1);

// Without any bounds on dynamic reduction dims, we need specialization to
// Without any bounds on dynamic dims, we need specialization to
// get peak performance. For now, just use the warp size.
if (numDynamicReductionDims) {
if (numDynamicDims) {
SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0);
int64_t preferredSubgroupSize = target.getPreferredSubgroupSize();
reductionTileSizes[reductionDims[0]] = preferredSubgroupSize;
Expand Down
31 changes: 31 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,34 @@ func.func @not_vmt() {
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[$CONFIG]]

// -----

// Regression test for warp-reduction configuration when a *parallel* loop
// dimension (d1) is dynamic: config selection should fall back to the
// subgroup-size-based tile sizes (checked by the CHECK/CDNA3 lines below)
// instead of the full tile-size search, keeping compilation time reasonable.
func.func @dynamic_parallel_dims(%dynsize : index, %input : tensor<4x?x4096xf16>) -> tensor<4x?xf32> {
%cst = arith.constant 0.0 : f32
// Dynamically-sized accumulator, zero-initialized before the reduction.
%0 = tensor.empty(%dynsize) : tensor<4x?xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x?xf32>) -> tensor<4x?xf32>
// Reduce the trailing static 4096 dimension (d2, "reduction" iterator)
// into the 4x? output; d0/d1 remain parallel, with d1 dynamic.
%2 = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"]}
ins(%input : tensor<4x?x4096xf16>) outs(%1 : tensor<4x?xf32>) {
^bb0(%in: f16, %out: f32):
// Widen f16 -> f32 before accumulating to keep the sum in f32.
%3 = arith.extf %in : f16 to f32
%4 = arith.addf %3, %out : f32
linalg.yield %4 : f32
} -> tensor<4x?xf32>
return %2 : tensor<4x?xf32>
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 64]{{\]}}
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1]>
// CHECK: func @dynamic_parallel_dims
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]

// CDNA3-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}
// CDNA3-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1]>
// CDNA3: func @dynamic_parallel_dims
// CDNA3-SAME: translation_info = #[[TRANSLATION]]
// CDNA3: linalg.generic
// CDNA3-SAME: lowering_config = #[[CONFIG]]
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,8 @@ func.func @i4_dequant_matvec() {
return
}

// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 256]{{\]}}>
// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 32>
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}>
// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1]>
// CHECK-LABEL: func.func @i4_dequant_matvec()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
Expand Down

0 comments on commit a2b5610

Please sign in to comment.