From e700bfa78f29888cbd881986708e26898f7a7ec7 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Wed, 16 Oct 2024 15:29:19 -0400
Subject: [PATCH] [tuner] Update gpu pipeline option handling (#282)

Use the attribute format introduced in
https://github.com/iree-org/iree/pull/18458.

Fixes: https://github.com/nod-ai/SHARK-Platform/issues/186
---
 tuner/tuner/candidate_gen.py      | 60 +++++++++++++++++---
 tuner/tuner/candidate_gen_test.py | 92 ++++++++++++++++++++++++++-----
 2 files changed, 131 insertions(+), 21 deletions(-)

diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py
index 16f0cf724..40eb27a82 100644
--- a/tuner/tuner/candidate_gen.py
+++ b/tuner/tuner/candidate_gen.py
@@ -24,10 +24,10 @@
 import pickle
 import re
 import z3
-from dataclasses import asdict, dataclass
+from dataclasses import astuple, dataclass
 from enum import Enum
 from os import mkdir, path, makedirs
-from typing import Callable, Optional
+from typing import Optional
 from textwrap import indent

 from abc import ABC, abstractmethod
@@ -148,6 +148,44 @@ def all():
     ]


+class ReorderWorkgroupsStrategy(Enum):
+    NONE = 0
+    SWIZZLE = 1
+    TRANSPOSE = 2
+
+    def __str__(self) -> str:
+        return self.name.title()
+
+
+@dataclass
+class GpuPipelineOptions:
+    """Represents the `iree_gpu.pipeline_options` attribute"""
+
+    prefetch_shared_memory: Optional[bool] = None
+    no_reduce_shared_memory_bank_conflicts: Optional[bool] = None
+    reorder_workgroups_strategy: Optional[ReorderWorkgroupsStrategy] = None
+
+    def all_default(self) -> bool:
+        return all(x is None for x in astuple(self))
+
+    def __str__(self) -> str:
+        options: list[str] = []
+        if self.prefetch_shared_memory is not None:
+            options.append(
+                f"prefetch_shared_memory = {str(self.prefetch_shared_memory).lower()}"
+            )
+        if self.no_reduce_shared_memory_bank_conflicts is not None:
+            options.append(
+                f"no_reduce_shared_memory_bank_conflicts = {str(self.no_reduce_shared_memory_bank_conflicts).lower()}"
+            )
+        if self.reorder_workgroups_strategy is not None:
+            options.append(
+                f"reorder_workgroups_strategy = {self.reorder_workgroups_strategy}"
+            )
+
+        return f"#iree_gpu.pipeline_options<{', '.join(options)}>"
+
+
 @dataclass
 class Configuration:
     subgroup_size: int
@@ -156,6 +194,7 @@ class Configuration:
     tile_sizes: list[int]
     subgroup_m_count: int
     subgroup_n_count: int
+    gpu_pipeline_options: GpuPipelineOptions
     waves_per_eu: int
@@ -223,7 +262,9 @@ def get_batch_mmt_tile_sizes(configuration: Configuration) -> list[int]:


 def get_pipeline_config(configuration: Configuration) -> str:
-    extra_config = ", prefetch_shared_memory"
+    extra_config = ""
+    if not configuration.gpu_pipeline_options.all_default():
+        extra_config += f", gpu_pipeline_options = {configuration.gpu_pipeline_options}"
     if configuration.waves_per_eu != 2:
         extra_config += f', llvm_func_attrs = {{"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"}}'
     return extra_config
@@ -234,17 +275,19 @@ def apply_configuration(
 ) -> str:
     tune_logger.info(f"Applying: {configuration}")
     expr0 = re.compile(
         r", subgroup_m_count = ([0-9]+), subgroup_n_count = ([0-9]+)>"
     )
     expr1 = re.compile(
         r"LLVMGPUVectorDistribute workgroup_size = \[.+\] subgroup_size = ([0-9]+),"
     )
     expr2 = re.compile(r"tile_sizes = \[\[([0-9]+)(, ([0-9]+))+\]\]")
-    expr3 = re.compile(r"\"amdgpu-waves-per-eu\" = \"([0-9])\"")
+    expr3 = re.compile(r"gpu_pipeline_options = #iree_gpu\.pipeline_options<([^>]*)>")
+    expr4 = re.compile(r"\"amdgpu-waves-per-eu\" = \"([0-9])\"")
     repl0 = f", subgroup_m_count = {configuration.subgroup_m_count}, subgroup_n_count = {configuration.subgroup_n_count}>"
     repl1 = f'LLVMGPUVectorDistribute workgroup_size = [{", ".join(map(str, configuration.workgroup_size))}] subgroup_size = {configuration.subgroup_size},'
     repl2 = f'tile_sizes = [[{", ".join(map(str, tile_sizes))}]]'
-    repl3 = f'"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"'
+    repl3 = f"gpu_pipeline_options = {configuration.gpu_pipeline_options}"
+    repl4 = f'"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"'

     new_mlir = ""
     for line in template:
@@ -254,8 +297,10 @@ def apply_configuration(
             line = re.sub(expr1, repl1, line)
         if "tile_sizes" in line:
             line = re.sub(expr2, repl2, line)
-        if "amdgpu-waves-per-eu" in line:
+        if "gpu_pipeline_options =" in line:
             line = re.sub(expr3, repl3, line)
+        if "amdgpu-waves-per-eu" in line:
+            line = re.sub(expr4, repl4, line)
         new_mlir += line

     return new_mlir
@@ -461,6 +506,7 @@ def generate_solutions(problem_size: ProblemSize, num_subgrups: int):
             [lookup(m), lookup(n), lookup(k)],
             lookup(sg_m_cnt),
             lookup(sg_n_cnt),
+            GpuPipelineOptions(),
             lookup(waves_per_eu),
         )
         solver.add(z3.simplify(z3.Not(z3.And(list(x == model[x] for x in all_vars)))))
diff --git a/tuner/tuner/candidate_gen_test.py b/tuner/tuner/candidate_gen_test.py
index 392f8bc06..2924db75b 100644
--- a/tuner/tuner/candidate_gen_test.py
+++ b/tuner/tuner/candidate_gen_test.py
@@ -67,6 +67,7 @@ def test_get_mmt_tile_sizes():
         tile_sizes=[128, 320, 32],
         subgroup_m_count=0,
         subgroup_n_count=0,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=0,
     )
     assert candidate_gen.get_mmt_tile_sizes(config) == [128, 320, 32]
@@ -80,6 +81,7 @@ def test_get_conv_tile_sizes():
         tile_sizes=[464, 320, 16],
         subgroup_m_count=1,
         subgroup_n_count=4,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=1,
     )
     assert candidate_gen.ConvTuner().get_conv_tile_sizes(config) == [
@@ -93,6 +95,32 @@ def test_get_conv_tile_sizes():
     ]


+def test_gpu_pipeline_options():
+    options = candidate_gen.GpuPipelineOptions()
+    assert options.all_default()
+    assert str(options) == "#iree_gpu.pipeline_options<>"
+
+    options.prefetch_shared_memory = True
+    assert not options.all_default()
+    assert str(options) == "#iree_gpu.pipeline_options<prefetch_shared_memory = true>"
+
+    options.no_reduce_shared_memory_bank_conflicts = False
+    assert (
+        str(options)
+        == "#iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>"
+    )
+
+    options = candidate_gen.GpuPipelineOptions()
+    options.reorder_workgroups_strategy = (
+        candidate_gen.ReorderWorkgroupsStrategy.TRANSPOSE
+    )
+    assert not options.all_default()
+    assert (
+        str(options)
+        == "#iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>"
+    )
+
+
 def test_get_contract_tile_sizes():
     config = candidate_gen.Configuration(
         subgroup_size=32,
@@ -101,6 +129,7 @@ def test_get_contract_tile_sizes():
         tile_sizes=[4, 8, 16],
         subgroup_m_count=1,
         subgroup_n_count=1,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=2,
     )
     assert candidate_gen.get_contract_tile_sizes(config, ["m", "n", "k"]) == [4, 8, 16]
@@ -114,28 +143,28 @@ def test_get_contract_tile_sizes():
 def test_get_pipeline_config():
-    config1 = candidate_gen.Configuration(
+    config = candidate_gen.Configuration(
         subgroup_size=32,
         workgroup_size=[16, 16, 1],
         intrinsic="",
         tile_sizes=[4, 8, 16],
         subgroup_m_count=1,
         subgroup_n_count=1,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=2,
     )
-    config2 = candidate_gen.Configuration(
-        subgroup_size=32,
-        workgroup_size=[16, 16, 1],
-        intrinsic="",
-        tile_sizes=[4, 8, 16],
-        subgroup_m_count=1,
-        subgroup_n_count=1,
-        waves_per_eu=4,
-    )
-    assert candidate_gen.get_pipeline_config(config1) == ", prefetch_shared_memory"
+    config1_str: str = candidate_gen.get_pipeline_config(config)
+    assert config1_str == ""
+
+    config.waves_per_eu = 4
+    config2_str: str = candidate_gen.get_pipeline_config(config)
+    assert config2_str == ', llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'
+
+    config.gpu_pipeline_options.prefetch_shared_memory = True
+    config3_str = candidate_gen.get_pipeline_config(config)
     assert (
-        candidate_gen.get_pipeline_config(config2)
-        == ', prefetch_shared_memory, llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'
+        config3_str
+        == ', gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>, llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'
     )
@@ -409,11 +438,18 @@ def test_generate_constraints_invalid_input():
     assert solver.check() == candidate_gen.z3.unsat


+def remove_comments(mlir: str) -> str:
+    return "\n".join(
+        filter(lambda x: not x.lstrip().startswith("//"), mlir.splitlines())
+    )
+
+
 def test_apply_params_mmt():
     mlir_template = [
         ", subgroup_m_count = 16, subgroup_n_count = 16>",
         "",
+        "gpu_pipeline_options = #iree_gpu.pipeline_options",
         '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
     ]
@@ -426,6 +462,9 @@ def test_apply_params_mmt():
         tile_sizes=[8, 8, 8],
         subgroup_m_count=16,
         subgroup_n_count=16,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(
+            prefetch_shared_memory=True
+        ),
         waves_per_eu=8,
     )
@@ -442,6 +481,7 @@ def test_apply_params_mmt():
     embeddable = tf_mlir.embeddable

     assert modified
+    modified = remove_comments(modified)
     assert embeddable
     assert (
         "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 16, subgroup_n_count = 16"
@@ -452,6 +492,10 @@ def test_apply_params_mmt():
         in modified
     )
     assert "tile_sizes = [[8, 8, 8]]" in modified
+    assert (
+        "gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>"
+        in modified
+    )
     assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "8"}' in modified
@@ -460,7 +504,7 @@ def test_apply_params_conv():
     mlir_template = [
         ", subgroup_m_count = 16, subgroup_n_count = 16>",
         "",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
+        'gpu_pipeline_options = #iree_gpu.pipeline_options, {llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
     ]

     n, oh, ow, oc, fh, fw, ic = 2, 64, 64, 640, 3, 3, 640
@@ -472,6 +516,9 @@ def test_apply_params_conv():
         tile_sizes=[464, 320, 16],
         subgroup_m_count=1,
         subgroup_n_count=4,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(
+            reorder_workgroups_strategy=candidate_gen.ReorderWorkgroupsStrategy.TRANSPOSE
+        ),
         waves_per_eu=2,
     )
@@ -492,6 +539,8 @@ def test_apply_params_conv():
     embeddable = tf_mlir.embeddable

     assert modified
+    modified = remove_comments(modified)
+
     assert embeddable
     assert (
         "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4"
@@ -502,6 +551,10 @@ def test_apply_params_conv():
         in modified
     )
     assert "tile_sizes = [[1, 1, 464, 320, 1, 1, 16]]" in modified
+    assert (
+        "gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>"
+        in modified
+    )
     assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified
@@ -529,6 +582,7 @@ def test_apply_params_contract():
         tile_sizes=[480, 384, 32],
         subgroup_m_count=1,
         subgroup_n_count=4,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=2,
     )
@@ -575,6 +629,7 @@ def test_apply_params_batch_matmul():
         tile_sizes=[416, 320, 128],
         subgroup_m_count=2,
         subgroup_n_count=2,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=2,
     )
@@ -586,6 +641,8 @@ def test_apply_params_batch_matmul():
     embeddable = tf_mlir.embeddable

     assert modified
+    modified = remove_comments(modified)
+
     assert embeddable
     assert (
         "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2"
         in modified
     )
@@ -622,6 +679,7 @@ def test_apply_params_batch_mmt_float():
         tile_sizes=[128, 64, 128],
         subgroup_m_count=2,
         subgroup_n_count=2,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=2,
     )
@@ -669,6 +727,7 @@ def test_apply_params_batch_mmt_int():
         tile_sizes=[128, 64, 128],
         subgroup_m_count=2,
         subgroup_n_count=2,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=4,
     )
@@ -681,6 +740,8 @@ def test_apply_params_batch_mmt_int():
     assert modified
     assert "// transform.named_sequence @match_batch_mmt_2x4096x640x640(" in modified

+    modified = remove_comments(modified)
+
     assert (
         "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2"
         in modified
     )
@@ -737,6 +798,7 @@ def test_apply_params_broadcast_rhs_mmt():
         tile_sizes=[128, 64, 128],
         subgroup_m_count=2,
         subgroup_n_count=2,
+        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
         waves_per_eu=4,
     )
@@ -752,6 +814,8 @@ def test_apply_params_broadcast_rhs_mmt():
         "// transform.named_sequence @match_broadcast_rhs_mmt_Bx4096x640x640("
         in modified
     )
+    modified = remove_comments(modified)
+
     assert (
         "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2"
        in modified
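
Usage sketch (illustration only, not part of the applied diff): the snippet below exercises the `GpuPipelineOptions` dataclass added in candidate_gen.py above, assuming the module is importable as `candidate_gen` per the tuner package layout shown in the diff.

    from candidate_gen import GpuPipelineOptions, ReorderWorkgroupsStrategy

    # Default-constructed options render as the empty attribute; in that case
    # get_pipeline_config() omits the `gpu_pipeline_options = ...` segment entirely.
    opts = GpuPipelineOptions()
    assert opts.all_default()
    assert str(opts) == "#iree_gpu.pipeline_options<>"

    # Any field that is set is printed in the MLIR attribute syntax, joined by ", ".
    opts.prefetch_shared_memory = True
    opts.reorder_workgroups_strategy = ReorderWorkgroupsStrategy.SWIZZLE
    assert str(opts) == (
        "#iree_gpu.pipeline_options<prefetch_shared_memory = true, "
        "reorder_workgroups_strategy = Swizzle>"
    )

This rendered string is exactly what apply_configuration() splices into the tuned MLIR via the expr3/repl3 substitution.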