From cf76f86d60b9e8e739163f11e60a48d666826496 Mon Sep 17 00:00:00 2001
From: Amily Wu <amilywu2@amd.com>
Date: Thu, 22 Aug 2024 14:28:11 -0500
Subject: [PATCH] Move test. Edit CI

---
 .github/workflows/ci.yaml                     |   2 +
 .../tools/tuner/candidate_gen_test.py         | 784 ------------------
 tuner-requirements.txt                        |   3 +
 3 files changed, 5 insertions(+), 784 deletions(-)
 delete mode 100644 sharktank/sharktank/tools/tuner/candidate_gen_test.py
 create mode 100644 tuner-requirements.txt

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index b2d302a8e..93d982446 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -53,9 +53,11 @@ jobs:
           # from non default locations first. Installing the PyTorch CPU
           # wheels saves multiple minutes and a lot of bandwidth on runner setup.
           pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -r tuner-requirements.txt
           pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
             -e "git+https://github.com/iree-org/iree-turbine.git#egg=shark-turbine"
           pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+          pip install --no-compile -f https://iree.dev/pip-release-links.html iree-compiler iree-runtime
 
       - name: Run sharktank tests
         if: ${{ !cancelled() }}
diff --git a/sharktank/sharktank/tools/tuner/candidate_gen_test.py b/sharktank/sharktank/tools/tuner/candidate_gen_test.py
deleted file mode 100644
index ad9b97e0a..000000000
--- a/sharktank/sharktank/tools/tuner/candidate_gen_test.py
+++ /dev/null
@@ -1,784 +0,0 @@
-# Copyright 2024 Advanced Micro Devices, Inc
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-import pytest
-import candidate_gen
-
-"""
-Usage: python -m pytest test_tune.py
-"""
-
-
-def test_get_shaped_type_element_bitwidth():
-    assert (
-        candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8).bitwidth
-        == 8
-    )
-    assert (
-        candidate_gen.ShapedType([2048], candidate_gen.ElementType.i32).bitwidth == 32
-    )
-    assert (
-        candidate_gen.ShapedType(
-            [2048, 512, 384], candidate_gen.ElementType.f8
-        ).bitwidth
-        == 8
-    )
-    assert (
-        candidate_gen.ShapedType([1, 1], candidate_gen.ElementType.f16).bitwidth == 16
-    )
-
-
-def test_get_shaped_type_to_str():
-    assert (
-        str(candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8))
-        == "1024x2048xi8"
-    )
-    assert (
-        str(candidate_gen.ShapedType([1024], candidate_gen.ElementType.f32))
-        == "1024xf32"
-    )
-    assert (
-        str(candidate_gen.ShapedType([1, 2, 3], candidate_gen.ElementType.f16))
-        == "1x2x3xf16"
-    )
-    assert (
-        str(candidate_gen.ShapedType([-1, 2, 3], candidate_gen.ElementType.f16))
-        == "?x2x3xf16"
-    )
-
-
-def test_parse_tensor_type():
-    assert candidate_gen.parse_tensor_type(
-        "tensor<1x2x3xf32>"
-    ) == candidate_gen.ShapedType([1, 2, 3], candidate_gen.ElementType.f32)
-    assert candidate_gen.parse_tensor_type(
-        "tensor<123xi8>"
-    ) == candidate_gen.ShapedType([123], candidate_gen.ElementType.i8)
-
-
-def test_get_mmt_tile_sizes():
-    config = candidate_gen.Configuration(
-        subgroup_size=0,
-        workgroup_size=[],
-        intrinsic="",
-        tile_sizes=[128, 320, 32],
-        subgroup_m_count=0,
-        subgroup_n_count=0,
-        waves_per_eu=0,
-    )
-    assert candidate_gen.get_mmt_tile_sizes(config) == [128, 320, 32]
-
-
-def test_get_conv_tile_sizes():
-    config = candidate_gen.Configuration(
-        subgroup_size=64,
-        workgroup_size=[256, 1, 1],
-        intrinsic="#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>",
-        tile_sizes=[464, 320, 16],
-        subgroup_m_count=1,
-        subgroup_n_count=4,
-        waves_per_eu=1,
-    )
-    assert candidate_gen.get_conv_tile_sizes(config) == [1, 1, 464, 320, 1, 1, 16]
-
-
-def test_get_contract_tile_sizes():
-    config = candidate_gen.Configuration(
-        subgroup_size=32,
-        workgroup_size=[16, 16, 1],
-        intrinsic="",
-        tile_sizes=[4, 8, 16],
-        subgroup_m_count=1,
-        subgroup_n_count=1,
-        waves_per_eu=2,
-    )
-    assert candidate_gen.get_contract_tile_sizes(config, ["m", "n", "k"]) == [4, 8, 16]
-    assert candidate_gen.get_contract_tile_sizes(config, ["n", "m", "k"]) == [8, 4, 16]
-    assert candidate_gen.get_contract_tile_sizes(config, ["k", "n", "m"]) == [16, 8, 4]
-    assert candidate_gen.get_contract_tile_sizes(config, ["k", "k", "k"]) == [
-        16,
-        16,
-        16,
-    ]
-
-
-def test_get_pipeline_config():
-    config1 = candidate_gen.Configuration(
-        subgroup_size=32,
-        workgroup_size=[16, 16, 1],
-        intrinsic="",
-        tile_sizes=[4, 8, 16],
-        subgroup_m_count=1,
-        subgroup_n_count=1,
-        waves_per_eu=2,
-    )
-    config2 = candidate_gen.Configuration(
-        subgroup_size=32,
-        workgroup_size=[16, 16, 1],
-        intrinsic="",
-        tile_sizes=[4, 8, 16],
-        subgroup_m_count=1,
-        subgroup_n_count=1,
-        waves_per_eu=4,
-    )
-    assert candidate_gen.get_pipeline_config(config1) == ", prefetch_shared_memory"
-    assert (
-        candidate_gen.get_pipeline_config(config2)
-        == ', prefetch_shared_memory, llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'
-    )
-
-
-def test_get_shapes_mmt():
-    template = [
-        r"%18 = tensor.empty() : tensor<2048x1280xf32>",
-        r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 128, 64]]>} ins(%cst : f32) outs(%18 : tensor<2048x1280xf32>) -> tensor<2048x1280xf32>",
-        r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 128, 64]]>} {',
-        r"^bb0(%in: f16, %in_0: f16, %out: f32):",
-    ]
-    assert candidate_gen.get_shapes_mmt(template) == candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(2048, 1280, 1280),
-        candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.mmt,
-    )
-
-
-def test_get_shapes_conv():
-    template = [
-        r"%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 32, 256, 1, 1, 32]]>} ins(%cst : f32) outs(%4 : tensor<1x1x32x256xf32>) -> tensor<1x1x32x256xf32>",
-        r"%8 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 32, 256, 1, 1, 32]]>, strides = dense<1> : vector<2xi64>} ins(%5, %6 : tensor<1x3x34x1280xf16>, tensor<3x3x1280x256xf16>) outs(%7 : tensor<1x1x32x256xf32>) -> tensor<1x1x32x256xf32>",
-        r"flow.dispatch.tensor.store %8, %2, offsets = [%workgroup_id_z, %workgroup_id_y, 0, %3], sizes = [1, 1, 32, 256], strides = [1, 1, 1, 1] : tensor<1x1x32x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>",
-    ]
-    assert candidate_gen.get_shapes_conv(template) == candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(32, 256, 11520),
-        candidate_gen.ShapedType([1, 3, 34, 1280], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([3, 3, 1280, 256], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([1, 1, 32, 256], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.conv,
-    )
-
-
-def test_get_shapes_contract():
-    template = [
-        r"%18 = tensor.empty() : tensor<2048x1280xf32>",
-        r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 128, 64]]>} ins(%cst : f32) outs(%18 : tensor<2048x1280xf32>) -> tensor<2048x1280xf32>",
-        r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 128, 64]]>} {',
-        r"^bb0(%in: f16, %in_0: f16, %out: f32):",
-    ]
-    assert candidate_gen.get_shapes_contract(
-        template, "mk", "nk"
-    ) == candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(2048, 1280, 1280),
-        candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.contraction,
-    )
-
-
-def test_get_shapes_batch_matmul():
-    template = [
-        "%10 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>",
-        "%11 = linalg.batch_matmul ins(%8, %9 : tensor<1x32x1024xf32>, tensor<1x1024x32xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>",
-        "flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : tensor<1x32x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x32x64xf32>>",
-    ]
-    assert candidate_gen.get_shapes_batch_matmul(
-        template, "bmk", "bkn"
-    ) == candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(32, 32, 1024, 1),
-        candidate_gen.ShapedType([1, 32, 1024], candidate_gen.ElementType.f32),
-        candidate_gen.ShapedType([1, 1024, 32], candidate_gen.ElementType.f32),
-        candidate_gen.ShapedType([1, 32, 32], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.batch_matmul,
-    )
-
-
-def test_get_shapes_batch_mmt():
-    template = [
-        r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 128]]>} ins(%c0_i32 : i32) outs(%18 : tensor<2x4096x640xi32>) -> tensor<2x4096x640xi32>",
-        r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<2x4096x640xi8>, tensor<2x640x640xi8>) outs(%19 : tensor<2x4096x640xi32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 128]]>} {',
-        r"flow.dispatch.tensor.store %21, %10, offsets = [0, 0, 0], sizes = [2, 4096, 640], strides = [1, 1, 1] : tensor<2x4096x640xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x640xf16>>",
-    ]
-    assert candidate_gen.get_shapes_batch_mmt(template) == candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(4096, 640, 640, 2),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i8),
-        candidate_gen.ShapedType([2, 640, 640], candidate_gen.ElementType.i8),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i32),
-        candidate_gen.DispatchKind.batch_mmt,
-    )
-
-
-def test_mfma_intrinsic_to_str():
-    assert (
-        str(candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32())
-        == "MFMA_F16_16x16x16_F32"
-    )
-    assert (
-        str(candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32())
-        == "MFMA_I8_32x32x16_I32"
-    )
-
-
-def test_get_compatible_mfma_intrinsics():
-    assert candidate_gen.get_compatible_mfma_intrinsics(
-        candidate_gen.ProblemSize(
-            candidate_gen.MatmulSize(2048, 1280, 1280),
-            candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16),
-            candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.f16),
-            candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f32),
-            candidate_gen.DispatchKind.mmt,
-        )
-    ) == [
-        candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(),
-        candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(),
-    ]
-
-    assert candidate_gen.get_compatible_mfma_intrinsics(
-        candidate_gen.ProblemSize(
-            candidate_gen.MatmulSize(2048, 1280, 1280),
-            candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.i8),
-            candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.i8),
-            candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.i32),
-            candidate_gen.DispatchKind.mmt,
-        )
-    ) == [
-        candidate_gen.MfmaIntrinsic.mfma_i8_16x16x32_i32(),
-        candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32(),
-    ]
-
-    assert candidate_gen.get_compatible_mfma_intrinsics(
-        candidate_gen.ProblemSize(
-            candidate_gen.MatmulSize(968, 320, 640, 64),
-            candidate_gen.ShapedType([64, 968, 640], candidate_gen.ElementType.f32),
-            candidate_gen.ShapedType([64, 640, 320], candidate_gen.ElementType.f32),
-            candidate_gen.ShapedType([64, 968, 320], candidate_gen.ElementType.f32),
-            candidate_gen.DispatchKind.batch_matmul,
-        )
-    ) == [
-        candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(),
-        candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(),
-    ]
-
-
-def test_generate_solutions():
-    matmul_size = candidate_gen.MatmulSize(2048, 3840, 1280)
-    lhs_type = candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16)
-    rhs_type = candidate_gen.ShapedType([3840, 1280], candidate_gen.ElementType.f16)
-    res_type = candidate_gen.ShapedType([2048, 3840], candidate_gen.ElementType.f32)
-    problem_size = candidate_gen.ProblemSize(
-        matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt
-    )
-    configs = candidate_gen.generate_solutions(problem_size, 4)
-    assert configs is not None
-
-
-def test_calculate_shared_memory_usage_in_bytes():
-    matmul_size = candidate_gen.MatmulSize(1024, 1024, 1024)
-    lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16)
-    rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16)
-    res_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f32)
-    problem_size = candidate_gen.ProblemSize(
-        matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt
-    )
-    assert (
-        candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128)
-        == 147456
-    )
-
-    lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.i8)
-    problem_size = candidate_gen.ProblemSize(
-        matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt
-    )
-    assert (
-        candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128)
-        == 81920
-    )
-
-    rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.i32)
-    problem_size = candidate_gen.ProblemSize(
-        matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt
-    )
-    assert (
-        candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 128, 64, 32)
-        == 12288
-    )
-
-
-def test_generate_constraints_valid_input():
-    matmul_size = candidate_gen.MatmulSize(1024, 1024, 1024)
-    lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16)
-    rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16)
-    res_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f32)
-    problem_size = candidate_gen.ProblemSize(
-        matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt
-    )
-    # Define input parameters as z3 Ints
-    m, n, k = (
-        candidate_gen.z3.Int("m"),
-        candidate_gen.z3.Int("n"),
-        candidate_gen.z3.Int("k"),
-    )
-    subgroup_size = candidate_gen.z3.Int("subgroup_size")
-    intrinsic_mn = candidate_gen.z3.Int("intrinsic_mn")
-    intrinsic_k = candidate_gen.z3.Int("intrinsic_k")
-    wg_x, wg_y, wg_z = (
-        candidate_gen.z3.Int("wg_x"),
-        candidate_gen.z3.Int("wg_y"),
-        candidate_gen.z3.Int("wg_z"),
-    )
-    sg_m_cnt = candidate_gen.z3.Int("sg_m_cnt")
-    sg_n_cnt = candidate_gen.z3.Int("sg_n_cnt")
-    waves_per_eu = candidate_gen.z3.Int("waves_per_eu")
-
-    constraints = candidate_gen.generate_constraints(
-        problem_size,
-        [m, n, k],
-        4,
-        subgroup_size,
-        [intrinsic_mn, intrinsic_k],
-        [wg_x, wg_y, wg_z],
-        sg_m_cnt,
-        sg_n_cnt,
-        waves_per_eu,
-    )
-
-    solver = candidate_gen.z3.Solver()
-    solver.add(constraints)
-
-    # Check if the constraints are satisfiable
-    assert solver.check() == candidate_gen.z3.sat
-
-
-def test_generate_constraints_invalid_input():
-    # Define input parameters that should lead to unsatisfiable constraints
-    matmul_size = candidate_gen.MatmulSize(1024, 1024, 1024)
-    lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16)
-    rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16)
-    res_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f32)
-    problem_size = candidate_gen.ProblemSize(
-        matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt
-    )
-    m, n, k = (
-        candidate_gen.z3.Int("m"),
-        candidate_gen.z3.Int("n"),
-        candidate_gen.z3.Int("k"),
-    )
-    subgroup_size = candidate_gen.z3.Int("subgroup_size")
-    intrinsic_mn = candidate_gen.z3.Int("intrinsic_mn")
-    intrinsic_k = candidate_gen.z3.Int("intrinsic_k")
-    wg_x, wg_y, wg_z = (
-        candidate_gen.z3.Int("wg_x"),
-        candidate_gen.z3.Int("wg_y"),
-        candidate_gen.z3.Int("wg_z"),
-    )
-    sg_m_cnt = candidate_gen.z3.Int("sg_m_cnt")
-    sg_n_cnt = candidate_gen.z3.Int("sg_n_cnt")
-    waves_per_eu = candidate_gen.z3.Int("waves_per_eu")
-
-    constraints = candidate_gen.generate_constraints(
-        problem_size,
-        [m, n, k],
-        4,
-        subgroup_size,
-        [intrinsic_mn, intrinsic_k],
-        [wg_x, wg_y, wg_z],
-        sg_m_cnt,
-        sg_n_cnt,
-        waves_per_eu,
-    )
-    constraints.append(m > 1000)  # Adding an additional unsatisfiable constraint
-
-    solver = candidate_gen.z3.Solver()
-    solver.add(constraints)
-
-    # Check if the constraints are unsatisfiable
-    assert solver.check() == candidate_gen.z3.unsat
-
-
-def test_apply_params_mmt():
-    mlir_template = [
-        "<intrinsic = #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>, subgroup_m_count = 16, subgroup_n_count = 16>",
-        "<LLVMGPUVectorDistribute workgroup_size = [16, 16] subgroup_size = 16,",
-        "<tile_sizes = [[8, 8, 8]]>",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
-    ]
-
-    M, N, K = 2048, 1280, 1280
-
-    config = candidate_gen.Configuration(
-        subgroup_size=16,
-        workgroup_size=[16, 16, 1],
-        intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(),
-        tile_sizes=[8, 8, 8],
-        subgroup_m_count=16,
-        subgroup_n_count=16,
-        waves_per_eu=8,
-    )
-
-    problem_size = candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(M, N, K),
-        candidate_gen.ShapedType([M, K], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([N, K], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([M, N], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.mmt,
-    )
-    modified, embeddable = candidate_gen.apply_params_mmt(
-        problem_size, mlir_template, config
-    )
-
-    assert modified
-    assert embeddable
-    assert (
-        "intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 16, subgroup_n_count = 16"
-        in modified
-    )
-    assert (
-        "LLVMGPUVectorDistribute workgroup_size = [16, 16, 1] subgroup_size = 16"
-        in modified
-    )
-    assert "tile_sizes = [[8, 8, 8]]" in modified
-    assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "8"}' in modified
-
-
-def test_apply_params_conv():
-    mlir_template = [
-        "<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 16, subgroup_n_count = 16>",
-        "<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64,",
-        "<tile_sizes = [[1, 1, 64, 128, 1, 1, 32]]>",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
-    ]
-
-    n, oh, ow, oc, fh, fw, ic = 2, 64, 64, 640, 3, 3, 640
-
-    config = candidate_gen.Configuration(
-        subgroup_size=64,
-        workgroup_size=[256, 1, 1],
-        intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(),
-        tile_sizes=[464, 320, 16],
-        subgroup_m_count=1,
-        subgroup_n_count=4,
-        waves_per_eu=2,
-    )
-
-    problem_size = candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(oh * ow, oc, fh * fw * ic),
-        candidate_gen.ShapedType(
-            [n, oh + 2, ow + 2, oc], candidate_gen.ElementType.f16
-        ),
-        candidate_gen.ShapedType([fh, fw, ic, oc], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([n, oh, ow, oc], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.conv,
-    )
-    modified, embeddable = candidate_gen.apply_params_conv(
-        problem_size, mlir_template, config
-    )
-
-    assert modified
-    assert embeddable
-    assert (
-        "intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4"
-        in modified
-    )
-    assert (
-        "LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64"
-        in modified
-    )
-    assert "tile_sizes = [[1, 1, 464, 320, 1, 1, 16]]" in modified
-    assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified
-
-
-def test_apply_params_contract():
-    mlir_template = [
-        "<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2>}>",
-        "<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64,",
-        "<tile_sizes = [[1, 1, 1, 64, 64, 128]]>",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}',
-    ]
-
-    tile_dims = "*mnk"
-    problem_size = candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(2048, 3840, 1280),
-        candidate_gen.ShapedType([2, 1024, 1280], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([3, 20, 64, 1280], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([3, 2, 20, 1024, 64], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.contraction,
-    )
-
-    config = candidate_gen.Configuration(
-        subgroup_size=64,
-        workgroup_size=[256, 1, 1],
-        intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(),
-        tile_sizes=[480, 384, 32],
-        subgroup_m_count=1,
-        subgroup_n_count=4,
-        waves_per_eu=2,
-    )
-
-    new_mlir, _embeddable = candidate_gen.apply_params_contract(
-        problem_size, tile_dims, mlir_template, config
-    )
-
-    assert new_mlir
-    assert (
-        "intrinsic = #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>, subgroup_m_count = 1, subgroup_n_count = 4"
-        in new_mlir
-    )
-    assert (
-        "LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64"
-        in new_mlir
-    )
-    assert "tile_sizes = [[1, 480, 384, 32]]" in new_mlir
-    assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in new_mlir
-
-
-def test_apply_params_batch_matmul():
-    mlir_template = [
-        "<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 4, subgroup_n_count = 1>}>",
-        "<LLVMGPUVectorDistribute workgroup_size = [64, 4, 1] subgroup_size = 64,",
-        "<tile_sizes = [[1, 128, 64, 64]]>",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}',
-    ]
-
-    tile_dims = "bmnk"
-    problem_size = candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(968, 320, 640, 64),
-        candidate_gen.ShapedType([64, 968, 640], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([64, 640, 320], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([64, 968, 320], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.batch_matmul,
-    )
-
-    config = candidate_gen.Configuration(
-        subgroup_size=64,
-        workgroup_size=[128, 2, 1],
-        intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(),
-        tile_sizes=[416, 320, 128],
-        subgroup_m_count=2,
-        subgroup_n_count=2,
-        waves_per_eu=2,
-    )
-
-    modified, embeddable = candidate_gen.apply_params_batch_matmul(
-        problem_size, tile_dims, mlir_template, config
-    )
-
-    assert modified
-    assert embeddable
-    assert (
-        "intrinsic = #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>, subgroup_m_count = 2, subgroup_n_count = 2"
-        in modified
-    )
-    assert (
-        "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64"
-        in modified
-    )
-    assert "tile_sizes = [[1, 416, 320, 128]]" in modified
-    assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified
-
-
-def test_apply_params_batch_mmt_float():
-    mlir_template = [
-        "<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 4, subgroup_n_count = 1>}>",
-        "<LLVMGPUVectorDistribute workgroup_size = [64, 4, 1] subgroup_size = 64,",
-        "<tile_sizes = [[1, 128, 128, 64]]>",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}',
-    ]
-
-    problem_size = candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(4096, 640, 640, 2),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([2, 640, 640], candidate_gen.ElementType.f16),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.f32),
-        candidate_gen.DispatchKind.batch_mmt,
-    )
-
-    config = candidate_gen.Configuration(
-        subgroup_size=64,
-        workgroup_size=[128, 2, 1],
-        intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(),
-        tile_sizes=[128, 64, 128],
-        subgroup_m_count=2,
-        subgroup_n_count=2,
-        waves_per_eu=2,
-    )
-
-    modified, embeddable = candidate_gen.apply_params_batch_mmt(
-        problem_size, mlir_template, config
-    )
-
-    assert embeddable
-    assert modified
-    assert (
-        "intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2"
-        in modified
-    )
-    assert (
-        "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64"
-        in modified
-    )
-    assert "tile_sizes = [[1, 128, 64, 128]]" in modified
-    assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified
-
-
-def test_apply_params_batch_mmt_int():
-    mlir_template = [
-        "<intrinsic = #iree_gpu.mma_layout<MFMA_I8_16x16x32_I32>, subgroup_m_count = 4, subgroup_n_count = 1>}>",
-        "<LLVMGPUVectorDistribute workgroup_size = [64, 4, 1] subgroup_size = 64,",
-        "<tile_sizes = [[1, 128, 128, 64]]>",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}',
-    ]
-
-    problem_size = candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(4096, 640, 640, 2),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i8),
-        candidate_gen.ShapedType([2, 640, 640], candidate_gen.ElementType.i8),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i32),
-        candidate_gen.DispatchKind.batch_mmt,
-    )
-
-    config = candidate_gen.Configuration(
-        subgroup_size=64,
-        workgroup_size=[128, 2, 1],
-        intrinsic=candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32(),
-        tile_sizes=[128, 64, 128],
-        subgroup_m_count=2,
-        subgroup_n_count=2,
-        waves_per_eu=4,
-    )
-
-    modified, embeddable = candidate_gen.apply_params_batch_mmt(
-        problem_size, mlir_template, config
-    )
-
-    assert modified
-    assert "//   transform.named_sequence @match_batch_mmt_2x4096x640x640(" in modified
-    assert (
-        "intrinsic = #iree_gpu.mma_layout<MFMA_I8_32x32x16_I32>, subgroup_m_count = 2, subgroup_n_count = 2"
-        in modified
-    )
-    assert (
-        "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64"
-        in modified
-    )
-    assert "tile_sizes = [[1, 128, 64, 128]]" in modified
-    assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in modified
-
-    assert embeddable
-    assert "transform.named_sequence @match_op(" in embeddable
-    assert (
-        "transform.include @match_batch_mmt_i8_i8_i32 failures(propagate)" in embeddable
-    )
-    assert (
-        "transform.iree.match.cast_compatible_type %lhs = tensor<2x4096x640xi8> : !transform.any_value"
-        in embeddable
-    )
-    assert (
-        "transform.iree.match.cast_compatible_type %rhs = tensor<2x640x640xi8> : !transform.any_value"
-        in embeddable
-    )
-    assert (
-        "%config = transform.param.constant #iree_codegen.compilation_info<"
-        in embeddable
-    )
-    assert "tile_sizes = [[1, 128, 64, 128]]" in embeddable
-    assert 'llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in embeddable
-    assert "workgroup_size = [128, 2, 1] subgroup_size = 64" in embeddable
-
-
-def test_apply_params_broadcast_rhs_mmt():
-    mlir_template = [
-        "<intrinsic = #iree_gpu.mma_layout<MFMA_I8_16x16x32_I32>, subgroup_m_count = 4, subgroup_n_count = 1>}>",
-        "<LLVMGPUVectorDistribute workgroup_size = [64, 4, 1] subgroup_size = 64,",
-        "<tile_sizes = [[1, 128, 128, 64]]>",
-        '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}',
-    ]
-
-    problem_size = candidate_gen.ProblemSize(
-        candidate_gen.MatmulSize(4096, 640, 640, 2),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i8),
-        candidate_gen.ShapedType([640, 640], candidate_gen.ElementType.i8),
-        candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i32),
-        candidate_gen.DispatchKind.broadcast_rhs_mmt,
-    )
-
-    config = candidate_gen.Configuration(
-        subgroup_size=64,
-        workgroup_size=[128, 2, 1],
-        intrinsic=candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32(),
-        tile_sizes=[128, 64, 128],
-        subgroup_m_count=2,
-        subgroup_n_count=2,
-        waves_per_eu=4,
-    )
-
-    modified, embeddable = candidate_gen.apply_params_broadcast_rhs_mmt(
-        problem_size, mlir_template, config
-    )
-
-    assert modified
-    assert (
-        "//   transform.named_sequence @match_broadcast_rhs_mmt_Bx4096x640x640("
-        in modified
-    )
-    assert (
-        "intrinsic = #iree_gpu.mma_layout<MFMA_I8_32x32x16_I32>, subgroup_m_count = 2, subgroup_n_count = 2"
-        in modified
-    )
-    assert (
-        "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64"
-        in modified
-    )
-    assert "tile_sizes = [[1, 128, 64, 128]]" in modified
-    assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in modified
-
-    assert embeddable
-    assert "transform.named_sequence @match_op(" in embeddable
-    assert (
-        "transform.include @match_broadcast_rhs_mmt_i8_i8_i32 failures(propagate)"
-        in embeddable
-    )
-    assert (
-        "transform.iree.match.cast_compatible_type %lhs = tensor<?x4096x640xi8> : !transform.any_value"
-        in embeddable
-    )
-    assert (
-        "transform.iree.match.cast_compatible_type %rhs = tensor<640x640xi8> : !transform.any_value"
-        in embeddable
-    )
-    assert (
-        "%config = transform.param.constant #iree_codegen.compilation_info<"
-        in embeddable
-    )
-    assert "tile_sizes = [[1, 128, 64, 128]]" in embeddable
-    assert 'llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in embeddable
-    assert "workgroup_size = [128, 2, 1] subgroup_size = 64" in embeddable
-
-
-def test_detect_broadcast_rhs_mmt():
-    mlir_lines = [
-        r"%18 = tensor.empty() : tensor<2x1024x10240xi32>",
-        r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 128]]>} ins(%c0_i32 : i32) outs(%18 : tensor<2x1024x10240xi32>) -> tensor<2x1024x10240xi32>",
-        r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<2x1024x1280xi8>, tensor<10240x1280xi8>) outs(%19 : tensor<2x1024x10240xi32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 128]]>} {',
-    ]
-    assert candidate_gen.is_broadcast_rhs_mmt(mlir_lines)
-
-
-def test_parse_mlir():
-    mlir_str = r"""
-    builtin.module  {
-      func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-        %0 = arith.mulf %arg0, %arg1 : tensor<4xf32>
-        return %0 : tensor<4xf32>
-      }
-    }
-  """
-    mlir_module = candidate_gen.parse_mlir(mlir_str)
-    assert mlir_module != None
-    assert isinstance(mlir_module, candidate_gen.ireec._mlir_libs._mlir.ir.Module)
-    assert isinstance(
-        mlir_module.body.operations[0], candidate_gen.ireec.dialects.func.FuncOp
-    )
diff --git a/tuner-requirements.txt b/tuner-requirements.txt
new file mode 100644
index 000000000..26ef8f6c7
--- /dev/null
+++ b/tuner-requirements.txt
@@ -0,0 +1,3 @@
+tqdm==4.66.4
+types-tqdm==4.66.0.20240417
+z3_solver==4.13.0.0