Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tuner] Update gpu pipeline option handling #282

Merged
merged 1 commit into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 53 additions & 7 deletions tuner/tuner/candidate_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@
import pickle
import re
import z3
from dataclasses import asdict, dataclass
from dataclasses import astuple, dataclass
from enum import Enum
from os import mkdir, path, makedirs
from typing import Callable, Optional
from typing import Optional
from textwrap import indent
from abc import ABC, abstractmethod

Expand Down Expand Up @@ -148,6 +148,44 @@ def all():
]


class ReorderWorkgroupsStrategy(Enum):
    """Workgroup-reorder strategies rendered into `iree_gpu.pipeline_options`."""

    NONE = 0
    SWIZZLE = 1
    TRANSPOSE = 2

    def __str__(self) -> str:
        # The MLIR attribute spells these capitalized: "None", "Swizzle",
        # "Transpose" (see the `reorder_workgroups_strategy = ...` output).
        return self.name.capitalize()


@dataclass
class GpuPipelineOptions:
    """Represents the `iree_gpu.pipeline_options` attribute.

    Each field is tri-state: None means "leave the compiler default", and
    only explicitly-set fields are rendered by `__str__`.
    """

    prefetch_shared_memory: Optional[bool] = None
    no_reduce_shared_memory_bank_conflicts: Optional[bool] = None
    reorder_workgroups_strategy: Optional[ReorderWorkgroupsStrategy] = None

    def all_default(self) -> bool:
        # No field was explicitly set.
        return astuple(self) == (None, None, None)

    def __str__(self) -> str:
        rendered: list[str] = []
        # Boolean options print as lowercase `true`/`false`.
        for label, flag in (
            ("prefetch_shared_memory", self.prefetch_shared_memory),
            (
                "no_reduce_shared_memory_bank_conflicts",
                self.no_reduce_shared_memory_bank_conflicts,
            ),
        ):
            if flag is not None:
                rendered.append(f"{label} = {str(flag).lower()}")
        if self.reorder_workgroups_strategy is not None:
            rendered.append(
                f"reorder_workgroups_strategy = {self.reorder_workgroups_strategy}"
            )
        return f"#iree_gpu.pipeline_options<{', '.join(rendered)}>"


@dataclass
class Configuration:
subgroup_size: int
Expand All @@ -156,6 +194,7 @@ class Configuration:
tile_sizes: list[int]
subgroup_m_count: int
subgroup_n_count: int
gpu_pipeline_options: GpuPipelineOptions
waves_per_eu: int


Expand Down Expand Up @@ -223,7 +262,9 @@ def get_batch_mmt_tile_sizes(configuration: Configuration) -> list[int]:


def get_pipeline_config(configuration: "Configuration") -> str:
    """Build the extra pipeline-config suffix for `configuration`.

    Returns "" when everything is at its default; otherwise a string of
    comma-prefixed entries (`gpu_pipeline_options` and/or `llvm_func_attrs`)
    ready to be appended after the base translation info.
    """
    # NOTE: dropped the dead `extra_config = ", prefetch_shared_memory"`
    # assignment that was immediately overwritten (stale diff residue).
    extra_config = ""
    if not configuration.gpu_pipeline_options.all_default():
        extra_config += f", gpu_pipeline_options = {configuration.gpu_pipeline_options}"
    if configuration.waves_per_eu != 2:
        # 2 is treated as the default here, so only emit an override.
        extra_config += f', llvm_func_attrs = {{"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"}}'
    return extra_config
Expand All @@ -234,17 +275,19 @@ def apply_configuration(
) -> str:
tune_logger.info(f"Applying: {configuration}")
expr0 = re.compile(
r"<intrinsic = #iree_gpu.mma_layout<(.+)>, subgroup_m_count = ([0-9]+), subgroup_n_count = ([0-9]+)>"
r"<intrinsic = #iree_gpu\.mma_layout<(.+)>, subgroup_m_count = ([0-9]+), subgroup_n_count = ([0-9]+)>"
)
expr1 = re.compile(
r"LLVMGPUVectorDistribute workgroup_size = \[.+\] subgroup_size = ([0-9]+),"
)
expr2 = re.compile(r"tile_sizes = \[\[([0-9]+)(, ([0-9]+))+\]\]")
expr3 = re.compile(r"\"amdgpu-waves-per-eu\" = \"([0-9])\"")
expr3 = re.compile(r"gpu_pipeline_options = #iree_gpu\.pipeline_options<([^>]*)>")
expr4 = re.compile(r"\"amdgpu-waves-per-eu\" = \"([0-9])\"")
repl0 = f"<intrinsic = #iree_gpu.mma_layout<{configuration.intrinsic}>, subgroup_m_count = {configuration.subgroup_m_count}, subgroup_n_count = {configuration.subgroup_n_count}>"
repl1 = f'LLVMGPUVectorDistribute workgroup_size = [{", ".join(map(str, configuration.workgroup_size))}] subgroup_size = {configuration.subgroup_size},'
repl2 = f'tile_sizes = [[{", ".join(map(str, tile_sizes))}]]'
repl3 = f'"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"'
repl3 = f"gpu_pipeline_options = {configuration.gpu_pipeline_options}"
repl4 = f'"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"'

new_mlir = ""
for line in template:
Expand All @@ -254,8 +297,10 @@ def apply_configuration(
line = re.sub(expr1, repl1, line)
if "tile_sizes" in line:
line = re.sub(expr2, repl2, line)
if "amdgpu-waves-per-eu" in line:
if "gpu_pipeline_options =" in line:
line = re.sub(expr3, repl3, line)
if "amdgpu-waves-per-eu" in line:
line = re.sub(expr4, repl4, line)
new_mlir += line

return new_mlir
Expand Down Expand Up @@ -461,6 +506,7 @@ def generate_solutions(problem_size: ProblemSize, num_subgrups: int):
[lookup(m), lookup(n), lookup(k)],
lookup(sg_m_cnt),
lookup(sg_n_cnt),
GpuPipelineOptions(),
lookup(waves_per_eu),
)
solver.add(z3.simplify(z3.Not(z3.And(list(x == model[x] for x in all_vars)))))
Expand Down
92 changes: 78 additions & 14 deletions tuner/tuner/candidate_gen_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def test_get_mmt_tile_sizes():
tile_sizes=[128, 320, 32],
subgroup_m_count=0,
subgroup_n_count=0,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=0,
)
assert candidate_gen.get_mmt_tile_sizes(config) == [128, 320, 32]
Expand All @@ -80,6 +81,7 @@ def test_get_conv_tile_sizes():
tile_sizes=[464, 320, 16],
subgroup_m_count=1,
subgroup_n_count=4,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=1,
)
assert candidate_gen.ConvTuner().get_conv_tile_sizes(config) == [
Expand All @@ -93,6 +95,32 @@ def test_get_conv_tile_sizes():
]


def test_gpu_pipeline_options():
    """GpuPipelineOptions renders only the explicitly-set fields."""
    opts = candidate_gen.GpuPipelineOptions()
    # Freshly constructed: everything unset, empty attribute body.
    assert opts.all_default()
    assert str(opts) == "#iree_gpu.pipeline_options<>"

    opts.prefetch_shared_memory = True
    assert not opts.all_default()
    assert str(opts) == "#iree_gpu.pipeline_options<prefetch_shared_memory = true>"

    opts.no_reduce_shared_memory_bank_conflicts = False
    expected = (
        "#iree_gpu.pipeline_options<prefetch_shared_memory = true, "
        "no_reduce_shared_memory_bank_conflicts = false>"
    )
    assert str(opts) == expected

    # A strategy-only configuration is still non-default and prints the
    # capitalized strategy name.
    opts = candidate_gen.GpuPipelineOptions()
    opts.reorder_workgroups_strategy = (
        candidate_gen.ReorderWorkgroupsStrategy.TRANSPOSE
    )
    assert not opts.all_default()
    assert (
        str(opts)
        == "#iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>"
    )


def test_get_contract_tile_sizes():
config = candidate_gen.Configuration(
subgroup_size=32,
Expand All @@ -101,6 +129,7 @@ def test_get_contract_tile_sizes():
tile_sizes=[4, 8, 16],
subgroup_m_count=1,
subgroup_n_count=1,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)
assert candidate_gen.get_contract_tile_sizes(config, ["m", "n", "k"]) == [4, 8, 16]
Expand All @@ -114,28 +143,28 @@ def test_get_contract_tile_sizes():


def test_get_pipeline_config():
    """get_pipeline_config emits only the non-default pipeline settings."""
    # Reconstructed from interleaved old/new diff lines: the stale
    # `config1`/`config2` definitions and the removed-side assertions made
    # this function syntactically invalid; only the updated single-config
    # flow is kept.
    config = candidate_gen.Configuration(
        subgroup_size=32,
        workgroup_size=[16, 16, 1],
        intrinsic="",
        tile_sizes=[4, 8, 16],
        subgroup_m_count=1,
        subgroup_n_count=1,
        gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
        waves_per_eu=2,
    )
    # All defaults (waves_per_eu == 2, no pipeline options): nothing extra.
    config1_str: str = candidate_gen.get_pipeline_config(config)
    assert config1_str == ""

    # A non-default waves_per_eu shows up as an llvm_func_attrs entry.
    config.waves_per_eu = 4
    config2_str: str = candidate_gen.get_pipeline_config(config)
    assert config2_str == ', llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'

    # Setting a pipeline option adds gpu_pipeline_options ahead of
    # llvm_func_attrs.
    config.gpu_pipeline_options.prefetch_shared_memory = True
    config3_str = candidate_gen.get_pipeline_config(config)
    assert (
        config3_str
        == ', gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>, llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'
    )


Expand Down Expand Up @@ -409,11 +438,18 @@ def test_generate_constraints_invalid_input():
assert solver.check() == candidate_gen.z3.unsat


def remove_comments(mlir: str) -> str:
    """Strip every line whose first non-whitespace characters are `//`."""
    kept = [line for line in mlir.splitlines() if not line.lstrip().startswith("//")]
    return "\n".join(kept)


def test_apply_params_mmt():
mlir_template = [
"<intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>, subgroup_m_count = 16, subgroup_n_count = 16>",
"<LLVMGPUVectorDistribute workgroup_size = [16, 16] subgroup_size = 16,",
"<tile_sizes = [[8, 8, 8]]>",
"gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None>",
'{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
]

Expand All @@ -426,6 +462,9 @@ def test_apply_params_mmt():
tile_sizes=[8, 8, 8],
subgroup_m_count=16,
subgroup_n_count=16,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(
prefetch_shared_memory=True
),
waves_per_eu=8,
)

Expand All @@ -442,6 +481,7 @@ def test_apply_params_mmt():
embeddable = tf_mlir.embeddable

assert modified
modified = remove_comments(modified)
assert embeddable
assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 16, subgroup_n_count = 16"
Expand All @@ -452,6 +492,10 @@ def test_apply_params_mmt():
in modified
)
assert "tile_sizes = [[8, 8, 8]]" in modified
assert (
"gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>"
in modified
)
assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "8"}' in modified


Expand All @@ -460,7 +504,7 @@ def test_apply_params_conv():
"<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 16, subgroup_n_count = 16>",
"<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64,",
"<tile_sizes = [[1, 1, 64, 128, 1, 1, 32]]>",
'{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
'gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>, {llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
]

n, oh, ow, oc, fh, fw, ic = 2, 64, 64, 640, 3, 3, 640
Expand All @@ -472,6 +516,9 @@ def test_apply_params_conv():
tile_sizes=[464, 320, 16],
subgroup_m_count=1,
subgroup_n_count=4,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(
reorder_workgroups_strategy=candidate_gen.ReorderWorkgroupsStrategy.TRANSPOSE
),
waves_per_eu=2,
)

Expand All @@ -492,6 +539,8 @@ def test_apply_params_conv():
embeddable = tf_mlir.embeddable

assert modified
modified = remove_comments(modified)

assert embeddable
assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4"
Expand All @@ -502,6 +551,10 @@ def test_apply_params_conv():
in modified
)
assert "tile_sizes = [[1, 1, 464, 320, 1, 1, 16]]" in modified
assert (
"gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>"
in modified
)
assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified


Expand Down Expand Up @@ -529,6 +582,7 @@ def test_apply_params_contract():
tile_sizes=[480, 384, 32],
subgroup_m_count=1,
subgroup_n_count=4,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)

Expand Down Expand Up @@ -575,6 +629,7 @@ def test_apply_params_batch_matmul():
tile_sizes=[416, 320, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)

Expand All @@ -586,6 +641,8 @@ def test_apply_params_batch_matmul():
embeddable = tf_mlir.embeddable

assert modified
modified = remove_comments(modified)

assert embeddable
assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>, subgroup_m_count = 2, subgroup_n_count = 2"
Expand Down Expand Up @@ -622,6 +679,7 @@ def test_apply_params_batch_mmt_float():
tile_sizes=[128, 64, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)

Expand Down Expand Up @@ -669,6 +727,7 @@ def test_apply_params_batch_mmt_int():
tile_sizes=[128, 64, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=4,
)

Expand All @@ -681,6 +740,8 @@ def test_apply_params_batch_mmt_int():

assert modified
assert "// transform.named_sequence @match_batch_mmt_2x4096x640x640(" in modified
modified = remove_comments(modified)

assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>, subgroup_m_count = 2, subgroup_n_count = 2"
in modified
Expand Down Expand Up @@ -737,6 +798,7 @@ def test_apply_params_broadcast_rhs_mmt():
tile_sizes=[128, 64, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=4,
)

Expand All @@ -752,6 +814,8 @@ def test_apply_params_broadcast_rhs_mmt():
"// transform.named_sequence @match_broadcast_rhs_mmt_Bx4096x640x640("
in modified
)
modified = remove_comments(modified)

assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>, subgroup_m_count = 2, subgroup_n_count = 2"
in modified
Expand Down
Loading