Clean up punet example
kuhar committed Sep 5, 2024
1 parent 29b20fe commit 3a287a6
Showing 6 changed files with 25 additions and 173 deletions.
3 changes: 3 additions & 0 deletions tuner/examples/punet/.gitignore
@@ -0,0 +1,3 @@
# Test files/dirs recommended by README.md.
dump-mmt
test-benchmark.mlir
80 changes: 0 additions & 80 deletions tuner/examples/punet/2.mlir

This file was deleted.

31 changes: 17 additions & 14 deletions tuner/examples/punet/README.md
@@ -5,39 +5,42 @@ Follow instructions in [`/tuner/README.md`](../README.md)

## Shell Scripts

The required shell scripts can be downloaded from: [sdxl-scripts](https://github.com/nod-ai/sdxl-scripts)
The required shell scripts can be downloaded from:
[sdxl-scripts](https://github.com/nod-ai/sdxl-scripts).

These scripts include:
1. `compile-punet-base.sh` - Used for compiling model candidates.
2. `compile_candidate.sh` - Used for compiling dispatch candidates.
3. `punet.sh` - Invoked by `compile_candidate.sh`.

Please configure the file paths and update commands in `PunetClient`.
**Note:** Alternatively, add these scripts to your `PATH` environment variable
Add the parent directories of these scripts to your `PATH` environment variable,
so that they can be picked up by `punet_autotune.py`.

## Running the Tuner

### [Optional] Generate a tunable mlir
A sample `2.mlir` is provided for test run. Hoever, this file may become outdated if IREE makes changes to the MLIR format. To ensure you are working with the latest format, please follow the instructions below to compile and generate the most recent benchmark file.
Use [`punet.sh`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/punet.sh) to compile the sample matmul `mmt.mlir` (can also find here: [`mmt_unet.mlir`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/mmt_unet.mlir)):
```
./punet.sh ./mmt.mlir -o baseline.vmfb --iree-hal-dump-executable-files-to=dump-mmt
cp ./dump-mmt/module_main_2_dispatch_0_rocm_hsaco_fb_benchmark.mlir ./2.mlir
Use
[`punet.sh`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/punet.sh)
to compile the sample matmul `mmt.mlir` (can also find here:
[`mmt_unet.mlir`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/mmt_unet.mlir)):
```shell
punet.sh mmt.mlir -o mmt.vmfb --iree-hal-dump-executable-files-to=dump-mmt
cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchmark.mlir
```

### Recommended Trial Run
For an initial trial to test the tuning loop, use:
```
python punet_autotune.py 2.mlir --num-candidates=1
```shell
python punet_autotune.py test-benchmark.mlir --num-candidates=10
```

### Dry Run Test
To perform a dry run (no GPU required), use:
```
python punet_autotune.py 2.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
```shell
python punet_autotune.py test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
```

### Basic Usage
```
python punet_autotune.py 2.mlir
```shell
python punet_autotune.py test-benchmark.mlir
```
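As an aside on the `PATH` setup recommended in the README changes above: a minimal shell snippet might look like the sketch below. The checkout location `$HOME/sdxl-scripts` is hypothetical, and only `punet.sh` and `mmt_unet.mlir` are confirmed to live under `tuning/`; adjust the paths to your own clone.

```shell
# Hypothetical checkout location; adjust to wherever sdxl-scripts is cloned.
export SDXL_SCRIPTS_DIR="$HOME/sdxl-scripts"

# Put the tuning scripts on PATH so punet_autotune.py can invoke them by name.
export PATH="$SDXL_SCRIPTS_DIR/tuning:$PATH"

# Sanity check: each script should now resolve on PATH.
command -v punet.sh compile_candidate.sh compile-punet-base.sh
```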
65 changes: 0 additions & 65 deletions tuner/examples/punet/mmt.mlir
@@ -1,20 +1,3 @@
// RUN: iree-compile --iree-hal-target-backends=rocm --iree-rocm-target-chip=gfx942 \
// RUN: --iree-rocm-link-bc=true --iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode \
// RUN: --iree-global-opt-propagate-transposes=true --iree-opt-outer-dim-concat=true \
// RUN: --iree-opt-const-eval=false --iree-codegen-gpu-native-math-precision=true --iree-rocm-waves-per-eu=2 \
// RUN: --iree-preprocessing-pass-pipeline='builtin.module(iree-preprocessing-transpose-convolution-pipeline)' \
// RUN: --iree-codegen-llvmgpu-use-vector-distribution --iree-codegen-transform-dialect-library=config.mlir \
// RUN: %s -o %s.vmfb

// To compile to for benchmarking, add:
// --iree-flow-export-benchmark-funcs --iree-hal-benchmark-dispatch-repeat-count=1000
//
// To benchmark:
// for i in {0..4} ; do
// iree-benchmark-module --device=rocm://7 --module=%s.vmfb --function="main_${i}_benchmark" --device_allocator=caching \
// --batch_size=1000 --benchmark_repetitions=5
// done

!matA_0 = tensor<2048x1280xf16>
!matB_0 = tensor<10240x1280xf16>
!matC_0 = tensor<2048x10240xf32>
@@ -26,51 +9,3 @@ func.func @main_0(%arg0: !matA_0, %arg1: !matB_0) -> !matC_0 {
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0
return %8 : !matC_0
}

!matA_1 = tensor<2048x1280xf16>
!matB_1 = tensor<1280x1280xf16>
!matC_1 = tensor<2048x1280xf32>

func.func @main_1(%arg0: !matA_1, %arg1: !matB_1) -> !matC_1 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_1
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_1) -> !matC_1
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_1, !matB_1) outs(%6 : !matC_1) -> !matC_1
return %8 : !matC_1
}

!matA_2 = tensor<2048x5120xf16>
!matB_2 = tensor<1280x5120xf16>
!matC_2 = tensor<2048x1280xf32>

func.func @main_2(%arg0: !matA_2, %arg1: !matB_2) -> !matC_2 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_2
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_2) -> !matC_2
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_2, !matB_2) outs(%6 : !matC_2) -> !matC_2
return %8 : !matC_2
}

!matA_3 = tensor<128x2048xf16>
!matB_3 = tensor<1280x2048xf16>
!matC_3 = tensor<128x1280xf32>

func.func @main_3(%arg0: !matA_3, %arg1: !matB_3) -> !matC_3 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_3
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_3) -> !matC_3
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_3, !matB_3) outs(%6 : !matC_3) -> !matC_3
return %8 : !matC_3
}

!matA_4 = tensor<8192x640xf16>
!matB_4 = tensor<5120x640xf16>
!matC_4 = tensor<8192x5120xf32>

func.func @main_4(%arg0: !matA_4, %arg1: !matB_4) -> !matC_4 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_4
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_4) -> !matC_4
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_4, !matB_4) outs(%6 : !matC_4) -> !matC_4
return %8 : !matC_4
}
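For reference, every function in `mmt.mlir` (the retained `main_0` as well as the deleted `main_1` through `main_4`) follows the same pattern: zero-fill the accumulator, then call `linalg.matmul_transpose_b`, which in standard linalg semantics multiplies the first operand by the transpose of the second. For `main_0` this works out to:

```math
C_{ij} = \sum_{k=1}^{1280} A_{ik} B_{jk}, \qquad
A \in \mathrm{f16}^{2048 \times 1280},\;
B \in \mathrm{f16}^{10240 \times 1280},\;
C \in \mathrm{f32}^{2048 \times 10240}.
```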
File mode change: tuner/libtuner.py 100755 → 100644 (no content changes).
19 changes: 5 additions & 14 deletions tuner/libtuner_test.py
@@ -120,20 +120,11 @@ def test_parse_dispatch_benchmark_results():
mock_result_3.candidate_id = 3
benchmark_results = [mock_result_1, mock_result_2, mock_result_3]

candidate_tracker_0 = libtuner.CandidateTracker(candidate_id=0)
candidate_tracker_0.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/0.mlir")
candidate_tracker_1 = libtuner.CandidateTracker(candidate_id=1)
candidate_tracker_1.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/1.mlir")
candidate_tracker_2 = libtuner.CandidateTracker(candidate_id=2)
candidate_tracker_2.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/2.mlir")
candidate_tracker_3 = libtuner.CandidateTracker(candidate_id=3)
candidate_tracker_3.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/3.mlir")
candidate_trackers = [
candidate_tracker_0,
candidate_tracker_1,
candidate_tracker_2,
candidate_tracker_3,
]
candidate_trackers = []
for i in range(4):
tracker = libtuner.CandidateTracker(candidate_id=i)
tracker.dispatch_mlir_path = libtuner.Path(f"/mock/mlir/path/{i}.mlir")
candidate_trackers.append(tracker)

expected_parsed_results = [
libtuner.ParsedDisptachBenchmarkResult(