Skip to content

Commit

Permalink
[tuner]: add amdgpu-waves-per-eu into translation_info
Browse files Browse the repository at this point in the history
Signed-off-by: Bangtian Liu <[email protected]>
  • Loading branch information
bangtianliu committed Dec 10, 2024
1 parent 6db201a commit 745d6e3
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 191 deletions.
101 changes: 15 additions & 86 deletions tuner/tuner/candidate_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@ def apply_configuration(
workgroup_sizes = lowering_config.workgroup_tile_sizes
reduction_sizes = lowering_config.reduction_tile_sizes
gpu_pipeline_options = configuration.translation_info.configuration[
GPU_PIPELINE_OPTIONS
GPU_PIPELINE_OPTIONS_KEY
]
waves_per_eu = configuration.translation_info.configuration[LLVM_FUNC_ATTRS_KEY][
WAVES_PER_EU_KEY
]
tune_logger.info(f"Applying: {configuration}")
expr0 = re.compile(
Expand All @@ -70,7 +73,7 @@ def apply_configuration(
repl2 = f"workgroup = {workgroup_sizes}"
repl3 = f"reduction = {reduction_sizes}"
repl4 = f"gpu_pipeline_options = {gpu_pipeline_options}"
repl5 = f'"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"'
repl5 = f'"amdgpu-waves-per-eu" = {waves_per_eu}'

new_mlir = ""
for line in template:
Expand Down Expand Up @@ -131,15 +134,6 @@ class MmtTuner(DispatchTuner, MmtParser):
def get_transform_function_mmt(
self, problem_size: ProblemSize, functionName: str, configuration: Configuration
) -> str:
lowering_config = configuration.lowering_config
intrinsic = lowering_config.mma_kind
(
subgroup_m_count,
subgroup_n_count,
) = lowering_config.subgroup_count_mn

wg_x, wg_y, wg_z = configuration.translation_info.workgroup_size
extra_config = get_pipeline_config(configuration)
return f"""
transform.named_sequence @{functionName}(%matmul: !transform.any_op {{transform.readonly}}) -> (!transform.any_op, !transform.any_param) {{
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
Expand All @@ -148,13 +142,8 @@ def get_transform_function_mmt(
transform.iree.match.cast_compatible_type %lhs = tensor<{problem_size.lhs_type}> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<{problem_size.rhs_type}> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = {configuration.lowering_config}>,
translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [{wg_x}, {wg_y}, {wg_z}] subgroup_size = {configuration.translation_info.subgroup_size},
{{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = {intrinsic},
subgroup_m_count = {subgroup_m_count}, subgroup_n_count = {subgroup_n_count}>
{extra_config}}}>
lowering_config = {configuration.lowering_config},
translation_info = {configuration.translation_info}
> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}}
Expand Down Expand Up @@ -200,16 +189,6 @@ def get_transform_function_conv(
filter = f"tensor<{problem_size.rhs_type}>"
output = f"tensor<{dynamic_batch_output_ty}>"

lowering_config = configuration.lowering_config
intrinsic = lowering_config.mma_kind
(
subgroup_m_count,
subgroup_n_count,
) = lowering_config.subgroup_count_mn

wg_x, wg_y, wg_z = configuration.translation_info.workgroup_size
extra_config = get_pipeline_config(configuration)

return f"""
transform.named_sequence @{functionName}(%conv: !transform.any_op {{transform.readonly}})
-> (!transform.any_op, !transform.any_param) {{
Expand All @@ -220,13 +199,8 @@ def get_transform_function_conv(
outs(%out : {output}) -> {output}
}} : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = {configuration.lowering_config}>,
translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [{wg_x}, {wg_y}, {wg_z}] subgroup_size = {configuration.translation_info.subgroup_size},
{{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = {intrinsic},
subgroup_m_count = {subgroup_m_count}, subgroup_n_count = {subgroup_n_count}>
{extra_config}}}>
lowering_config = {configuration.lowering_config},
translation_info = {configuration.translation_info}
> -> !transform.any_param
transform.yield %conv, %config : !transform.any_op, !transform.any_param
}}
Expand Down Expand Up @@ -265,16 +239,6 @@ def get_transform_function_broadcast_rhs_mmt(
functionName: str,
configuration: Configuration,
) -> str:
lowering_config = configuration.lowering_config
intrinsic = lowering_config.mma_kind
(
subgroup_m_count,
subgroup_n_count,
) = lowering_config.subgroup_count_mn

wg_x, wg_y, wg_z = configuration.translation_info.workgroup_size
extra_config = get_pipeline_config(configuration)

lhs_dynamic_batch = problem_size.lhs_type
lhs_dynamic_batch.shape = lhs_dynamic_batch.shape.copy()
lhs_dynamic_batch.shape[0] = -1
Expand All @@ -287,13 +251,8 @@ def get_transform_function_broadcast_rhs_mmt(
transform.iree.match.cast_compatible_type %lhs = tensor<{lhs_dynamic_batch}> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<{problem_size.rhs_type}> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = {configuration.lowering_config}>,
translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [{wg_x}, {wg_y}, {wg_z}] subgroup_size = {configuration.translation_info.subgroup_size},
{{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = {intrinsic},
subgroup_m_count = {subgroup_m_count}, subgroup_n_count = {subgroup_n_count}>
{extra_config}}}>
lowering_config = {configuration.lowering_config},
translation_info = {configuration.translation_info}
> -> !transform.any_param
transform.yield %generic, %config : !transform.any_op, !transform.any_param
}}
Expand Down Expand Up @@ -354,16 +313,6 @@ def get_transform_function_batch_mmt(
functionName: str,
configuration: Configuration,
) -> str:
lowering_config = configuration.lowering_config
intrinsic = lowering_config.mma_kind
(
subgroup_m_count,
subgroup_n_count,
) = lowering_config.subgroup_count_mn

wg_x, wg_y, wg_z = configuration.translation_info.workgroup_size
extra_config = get_pipeline_config(configuration)

return f"""
transform.named_sequence @{functionName}(%generic: !transform.any_op {{transform.readonly}}) -> (!transform.any_op, !transform.any_param) {{
%mmt = transform.include @match_batch_mmt_i8_i8_i32 failures(propagate) (%generic) : (!transform.any_op) -> !transform.any_op
Expand All @@ -372,13 +321,8 @@ def get_transform_function_batch_mmt(
transform.iree.match.cast_compatible_type %lhs = tensor<{problem_size.lhs_type}> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<{problem_size.rhs_type}> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = {configuration.lowering_config}>,
translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [{wg_x}, {wg_y}, {wg_z}] subgroup_size = {configuration.translation_info.subgroup_size},
{{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = {intrinsic},
subgroup_m_count = {subgroup_m_count}, subgroup_n_count = {subgroup_n_count}>
{extra_config}}}>
lowering_config = {configuration.lowering_config},
translation_info = {configuration.translation_info}
> -> !transform.any_param
transform.yield %generic, %config : !transform.any_op, !transform.any_param
}}
Expand Down Expand Up @@ -424,16 +368,6 @@ def get_transform_function_batch_matmul(
input1 = f"tensor<{problem_size.rhs_type}>"
output = f"tensor<{problem_size.res_type}>"

lowering_config = configuration.lowering_config
intrinsic = lowering_config.mma_kind
(
subgroup_m_count,
subgroup_n_count,
) = lowering_config.subgroup_count_mn

wg_x, wg_y, wg_z = configuration.translation_info.workgroup_size
extra_config = get_pipeline_config(configuration)

return f"""
transform.named_sequence @{functionName}(%batch_matmul: !transform.any_op {{transform.readonly}})
-> (!transform.any_op, !transform.any_param) {{
Expand All @@ -444,13 +378,8 @@ def get_transform_function_batch_matmul(
outs(%out : {output}) -> {output}
}} : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = {configuration.lowering_config}>,
translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute
workgroup_size = [{wg_x}, {wg_y}, {wg_z}] subgroup_size = {configuration.translation_info.subgroup_size},
{{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = {intrinsic},
subgroup_m_count = {subgroup_m_count}, subgroup_n_count = {subgroup_n_count}>
{extra_config}}}>
lowering_config = {configuration.lowering_config},
translation_info = {configuration.translation_info}
> -> !transform.any_param
transform.yield %batch_matmul, %config : !transform.any_op, !transform.any_param
}}
Expand Down
105 changes: 63 additions & 42 deletions tuner/tuner/candidate_gen_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,22 @@ def test_apply_params_mmt(tuner_ctx: common.TunerContext) -> None:
subgroup_n_count=16,
)
pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorize
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute
)
pipeline_option = iree_gpu.PipelineOptionsAttr.get(prefetch_shared_memory=True)
pipeline_option_dict = ir.DictAttr.get(
{common.GPU_PIPELINE_OPTIONS: pipeline_option}
pipeline_options = iree_gpu.PipelineOptionsAttr.get(prefetch_shared_memory=True)
waves_per_eu_dict = ir.DictAttr.get({"amdgpu-waves-per-eu": ir.StringAttr.get("8")})
config_dict = ir.DictAttr.get(
{
common.GPU_PIPELINE_OPTIONS_KEY: pipeline_options,
common.LLVM_FUNC_ATTRS_KEY: waves_per_eu_dict,
}
)
translation_info = iree_codegen.TranslationInfoAttr.get(
pipeline_attr, None, [16, 16, 1], 16, pipeline_option_dict
pipeline_attr, None, [16, 16, 1], 16, config_dict
)
config = common.Configuration(
translation_info=translation_info,
lowering_config=lowering_config,
waves_per_eu=8,
)

problem_size = common.ProblemSize(
Expand Down Expand Up @@ -126,23 +129,26 @@ def test_apply_params_conv(tuner_ctx: common.TunerContext) -> None:
subgroup_n_count=4,
)
pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorize
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute
)
pipeline_option = iree_gpu.PipelineOptionsAttr.get(
pipeline_options = iree_gpu.PipelineOptionsAttr.get(
reorder_workgroups_strategy=iree_gpu.ReorderWorkgroupsStrategyAttr.get(
iree_gpu.ReorderWorkgroupsStrategy.Transpose
)
)
pipeline_option_dict = ir.DictAttr.get(
{common.GPU_PIPELINE_OPTIONS: pipeline_option}
waves_per_eu_dict = ir.DictAttr.get({"amdgpu-waves-per-eu": ir.StringAttr.get("2")})
config_dict = ir.DictAttr.get(
{
common.GPU_PIPELINE_OPTIONS_KEY: pipeline_options,
common.LLVM_FUNC_ATTRS_KEY: waves_per_eu_dict,
}
)
translation_info = iree_codegen.TranslationInfoAttr.get(
pipeline_attr, None, [256, 1, 1], 64, pipeline_option_dict
pipeline_attr, None, [256, 1, 1], 64, config_dict
)
config = common.Configuration(
translation_info=translation_info,
lowering_config=lowering_config,
waves_per_eu=2,
)

problem_size = common.ProblemSize(
Expand Down Expand Up @@ -207,19 +213,22 @@ def test_apply_params_contract(tuner_ctx: common.TunerContext) -> None:
subgroup_n_count=4,
)
pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorize
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute
)
pipeline_option = iree_gpu.PipelineOptionsAttr.get()
pipeline_option_dict = ir.DictAttr.get(
{common.GPU_PIPELINE_OPTIONS: pipeline_option}
pipeline_options = iree_gpu.PipelineOptionsAttr.get()
waves_per_eu_dict = ir.DictAttr.get({"amdgpu-waves-per-eu": ir.StringAttr.get("2")})
config_dict = ir.DictAttr.get(
{
common.GPU_PIPELINE_OPTIONS_KEY: pipeline_options,
common.LLVM_FUNC_ATTRS_KEY: waves_per_eu_dict,
}
)
translation_info = iree_codegen.TranslationInfoAttr.get(
pipeline_attr, None, [256, 1, 1], 64, pipeline_option_dict
pipeline_attr, None, [256, 1, 1], 64, config_dict
)
config = common.Configuration(
translation_info=translation_info,
lowering_config=lowering_config,
waves_per_eu=2,
)

tf_mlir = candidate_gen.ContractionTuner("mk", "nk", tile_dims).apply_params(
Expand Down Expand Up @@ -270,19 +279,22 @@ def test_apply_params_batch_matmul(tuner_ctx: common.TunerContext) -> None:
subgroup_n_count=2,
)
pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorize
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute
)
pipeline_option = iree_gpu.PipelineOptionsAttr.get()
pipeline_option_dict = ir.DictAttr.get(
{common.GPU_PIPELINE_OPTIONS: pipeline_option}
pipeline_options = iree_gpu.PipelineOptionsAttr.get()
waves_per_eu_dict = ir.DictAttr.get({"amdgpu-waves-per-eu": ir.StringAttr.get("2")})
config_dict = ir.DictAttr.get(
{
common.GPU_PIPELINE_OPTIONS_KEY: pipeline_options,
common.LLVM_FUNC_ATTRS_KEY: waves_per_eu_dict,
}
)
translation_info = iree_codegen.TranslationInfoAttr.get(
pipeline_attr, None, [128, 2, 1], 64, pipeline_option_dict
pipeline_attr, None, [128, 2, 1], 64, config_dict
)
config = common.Configuration(
translation_info=translation_info,
lowering_config=lowering_config,
waves_per_eu=2,
)

tf_mlir = candidate_gen.BatchMatmulTuner("mk", "nk", tile_dims).apply_params(
Expand Down Expand Up @@ -336,19 +348,22 @@ def test_apply_params_batch_mmt_float(tuner_ctx: common.TunerContext) -> None:
subgroup_n_count=2,
)
pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorize
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute
)
pipeline_option = iree_gpu.PipelineOptionsAttr.get()
pipeline_option_dict = ir.DictAttr.get(
{common.GPU_PIPELINE_OPTIONS: pipeline_option}
pipeline_options = iree_gpu.PipelineOptionsAttr.get()
waves_per_eu_dict = ir.DictAttr.get({"amdgpu-waves-per-eu": ir.StringAttr.get("2")})
config_dict = ir.DictAttr.get(
{
common.GPU_PIPELINE_OPTIONS_KEY: pipeline_options,
common.LLVM_FUNC_ATTRS_KEY: waves_per_eu_dict,
}
)
translation_info = iree_codegen.TranslationInfoAttr.get(
pipeline_attr, None, [128, 2, 1], 64, pipeline_option_dict
pipeline_attr, None, [128, 2, 1], 64, config_dict
)
config = common.Configuration(
translation_info=translation_info,
lowering_config=lowering_config,
waves_per_eu=2,
)

tf_mlir = candidate_gen.BatchMmtTuner().apply_params(
Expand Down Expand Up @@ -400,19 +415,22 @@ def test_apply_params_batch_mmt_int(tuner_ctx: common.TunerContext) -> None:
subgroup_n_count=2,
)
pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorize
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute
)
pipeline_option = iree_gpu.PipelineOptionsAttr.get()
pipeline_option_dict = ir.DictAttr.get(
{common.GPU_PIPELINE_OPTIONS: pipeline_option}
pipeline_options = iree_gpu.PipelineOptionsAttr.get()
waves_per_eu_dict = ir.DictAttr.get({"amdgpu-waves-per-eu": ir.StringAttr.get("4")})
config_dict = ir.DictAttr.get(
{
common.GPU_PIPELINE_OPTIONS_KEY: pipeline_options,
common.LLVM_FUNC_ATTRS_KEY: waves_per_eu_dict,
}
)
translation_info = iree_codegen.TranslationInfoAttr.get(
pipeline_attr, None, [128, 2, 1], 64, pipeline_option_dict
pipeline_attr, None, [128, 2, 1], 64, config_dict
)
config = common.Configuration(
translation_info=translation_info,
lowering_config=lowering_config,
waves_per_eu=4,
)

tf_mlir = candidate_gen.BatchMmtTuner().apply_params(
Expand Down Expand Up @@ -488,19 +506,22 @@ def test_apply_params_broadcast_rhs_mmt(tuner_ctx: common.TunerContext) -> None:
subgroup_n_count=2,
)
pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorize
iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute
)
pipeline_option = iree_gpu.PipelineOptionsAttr.get()
pipeline_option_dict = ir.DictAttr.get(
{common.GPU_PIPELINE_OPTIONS: pipeline_option}
pipeline_options = iree_gpu.PipelineOptionsAttr.get()
waves_per_eu_dict = ir.DictAttr.get({"amdgpu-waves-per-eu": ir.StringAttr.get("4")})
config_dict = ir.DictAttr.get(
{
common.GPU_PIPELINE_OPTIONS_KEY: pipeline_options,
common.LLVM_FUNC_ATTRS_KEY: waves_per_eu_dict,
}
)
translation_info = iree_codegen.TranslationInfoAttr.get(
pipeline_attr, None, [128, 2, 1], 64, pipeline_option_dict
pipeline_attr, None, [128, 2, 1], 64, config_dict
)
config = common.Configuration(
translation_info=translation_info,
lowering_config=lowering_config,
waves_per_eu=4,
)

tf_mlir = candidate_gen.ContractionTuner(
Expand Down
Loading

0 comments on commit 745d6e3

Please sign in to comment.