Clean up punet example
kuhar committed Sep 5, 2024
1 parent 29b20fe commit 3a287a6
Showing 6 changed files with 25 additions and 173 deletions.
3 changes: 3 additions & 0 deletions tuner/examples/punet/.gitignore
@@ -0,0 +1,3 @@
# Test files/dirs recommended by README.md.
dump-mmt
test-benchmark.mlir
80 changes: 0 additions & 80 deletions tuner/examples/punet/2.mlir

This file was deleted.

31 changes: 17 additions & 14 deletions tuner/examples/punet/README.md
@@ -5,39 +5,42 @@ Follow instructions in [`/tuner/README.md`](../README.md)

## Shell Scripts

The required shell scripts can be downloaded from: [sdxl-scripts](https://github.com/nod-ai/sdxl-scripts)
The required shell scripts can be downloaded from:
[sdxl-scripts](https://github.com/nod-ai/sdxl-scripts).

These scripts include:
1. `compile-punet-base.sh` - Used for compiling model candidates.
2. `compile_candidate.sh` - Used for compiling dispatch candidates.
3. `punet.sh` - Invoked by `compile_candidate.sh`.

Please configure the file paths and update commands in `PunetClient`.
**Note:** Alternatively, add these scripts to your `PATH` environment variable
Add the parent directories of these scripts to your `PATH` environment variable,
so that they can be picked up by `punet_autotune.py`.

## Running the Tuner

### [Optional] Generate a tunable mlir
A sample `2.mlir` is provided for test run. Hoever, this file may become outdated if IREE makes changes to the MLIR format. To ensure you are working with the latest format, please follow the instructions below to compile and generate the most recent benchmark file.
Use [`punet.sh`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/punet.sh) to compile the sample matmul `mmt.mlir` (can also find here: [`mmt_unet.mlir`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/mmt_unet.mlir)):
```
./punet.sh ./mmt.mlir -o baseline.vmfb --iree-hal-dump-executable-files-to=dump-mmt
cp ./dump-mmt/module_main_2_dispatch_0_rocm_hsaco_fb_benchmark.mlir ./2.mlir
Use
[`punet.sh`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/punet.sh)
to compile the sample matmul `mmt.mlir` (can also find here:
[`mmt_unet.mlir`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/mmt_unet.mlir)):
```shell
punet.sh mmt.mlir -o mmt.vmfb --iree-hal-dump-executable-files-to=dump-mmt
cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchmark.mlir
```

### Recommended Trial Run
For an initial trial to test the tuning loop, use:
```
python punet_autotune.py 2.mlir --num-candidates=1
```shell
python punet_autotune.py test-benchmark.mlir --num-candidates=10
```

### Dry Run Test
To perform a dry run (no GPU required), use:
```
python punet_autotune.py 2.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
```shell
python punet_autotune.py test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
```

### Basic Usage
```
python punet_autotune.py 2.mlir
```shell
python punet_autotune.py test-benchmark.mlir
```
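As an aside on the `PATH` setup recommended in the README changes above: a minimal shell snippet might look like the sketch below. The checkout location `$HOME/sdxl-scripts` is hypothetical, and only `punet.sh` and `mmt_unet.mlir` are confirmed to live under `tuning/`; adjust the paths to your own clone.

```shell
# Hypothetical checkout location; adjust to wherever sdxl-scripts is cloned.
export SDXL_SCRIPTS_DIR="$HOME/sdxl-scripts"

# Put the tuning scripts on PATH so punet_autotune.py can invoke them by name.
export PATH="$SDXL_SCRIPTS_DIR/tuning:$PATH"

# Sanity check: each script should now resolve on PATH.
command -v punet.sh compile_candidate.sh compile-punet-base.sh
```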
65 changes: 0 additions & 65 deletions tuner/examples/punet/mmt.mlir
@@ -1,20 +1,3 @@
// RUN: iree-compile --iree-hal-target-backends=rocm --iree-rocm-target-chip=gfx942 \
// RUN: --iree-rocm-link-bc=true --iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode \
// RUN: --iree-global-opt-propagate-transposes=true --iree-opt-outer-dim-concat=true \
// RUN: --iree-opt-const-eval=false --iree-codegen-gpu-native-math-precision=true --iree-rocm-waves-per-eu=2 \
// RUN: --iree-preprocessing-pass-pipeline='builtin.module(iree-preprocessing-transpose-convolution-pipeline)' \
// RUN: --iree-codegen-llvmgpu-use-vector-distribution --iree-codegen-transform-dialect-library=config.mlir \
// RUN: %s -o %s.vmfb

// To compile to for benchmarking, add:
// --iree-flow-export-benchmark-funcs --iree-hal-benchmark-dispatch-repeat-count=1000
//
// To benchmark:
// for i in {0..4} ; do
// iree-benchmark-module --device=rocm://7 --module=%s.vmfb --function="main_${i}_benchmark" --device_allocator=caching \
// --batch_size=1000 --benchmark_repetitions=5
// done

!matA_0 = tensor<2048x1280xf16>
!matB_0 = tensor<10240x1280xf16>
!matC_0 = tensor<2048x10240xf32>
@@ -26,51 +9,3 @@ func.func @main_0(%arg0: !matA_0, %arg1: !matB_0) -> !matC_0 {
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0
return %8 : !matC_0
}

!matA_1 = tensor<2048x1280xf16>
!matB_1 = tensor<1280x1280xf16>
!matC_1 = tensor<2048x1280xf32>

func.func @main_1(%arg0: !matA_1, %arg1: !matB_1) -> !matC_1 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_1
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_1) -> !matC_1
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_1, !matB_1) outs(%6 : !matC_1) -> !matC_1
return %8 : !matC_1
}

!matA_2 = tensor<2048x5120xf16>
!matB_2 = tensor<1280x5120xf16>
!matC_2 = tensor<2048x1280xf32>

func.func @main_2(%arg0: !matA_2, %arg1: !matB_2) -> !matC_2 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_2
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_2) -> !matC_2
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_2, !matB_2) outs(%6 : !matC_2) -> !matC_2
return %8 : !matC_2
}

!matA_3 = tensor<128x2048xf16>
!matB_3 = tensor<1280x2048xf16>
!matC_3 = tensor<128x1280xf32>

func.func @main_3(%arg0: !matA_3, %arg1: !matB_3) -> !matC_3 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_3
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_3) -> !matC_3
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_3, !matB_3) outs(%6 : !matC_3) -> !matC_3
return %8 : !matC_3
}

!matA_4 = tensor<8192x640xf16>
!matB_4 = tensor<5120x640xf16>
!matC_4 = tensor<8192x5120xf32>

func.func @main_4(%arg0: !matA_4, %arg1: !matB_4) -> !matC_4 {
%cst = arith.constant 0.000000e+00 : f16
%5 = tensor.empty() : !matC_4
%6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_4) -> !matC_4
%8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_4, !matB_4) outs(%6 : !matC_4) -> !matC_4
return %8 : !matC_4
}
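For reference, every function in `mmt.mlir` (the retained `main_0` as well as the deleted `main_1` through `main_4`) follows the same pattern: zero-fill the accumulator, then call `linalg.matmul_transpose_b`, which in standard linalg semantics multiplies the first operand by the transpose of the second. For `main_0` this works out to:

```math
C_{ij} = \sum_{k=1}^{1280} A_{ik} B_{jk}, \qquad
A \in \mathrm{f16}^{2048 \times 1280},\;
B \in \mathrm{f16}^{10240 \times 1280},\;
C \in \mathrm{f32}^{2048 \times 10240}.
```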
File mode change: tuner/libtuner.py 100755 → 100644 (no content changes).
19 changes: 5 additions & 14 deletions tuner/libtuner_test.py
@@ -120,20 +120,11 @@ def test_parse_dispatch_benchmark_results():
mock_result_3.candidate_id = 3
benchmark_results = [mock_result_1, mock_result_2, mock_result_3]

candidate_tracker_0 = libtuner.CandidateTracker(candidate_id=0)
candidate_tracker_0.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/0.mlir")
candidate_tracker_1 = libtuner.CandidateTracker(candidate_id=1)
candidate_tracker_1.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/1.mlir")
candidate_tracker_2 = libtuner.CandidateTracker(candidate_id=2)
candidate_tracker_2.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/2.mlir")
candidate_tracker_3 = libtuner.CandidateTracker(candidate_id=3)
candidate_tracker_3.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/3.mlir")
candidate_trackers = [
candidate_tracker_0,
candidate_tracker_1,
candidate_tracker_2,
candidate_tracker_3,
]
candidate_trackers = []
for i in range(4):
tracker = libtuner.CandidateTracker(candidate_id=i)
tracker.dispatch_mlir_path = libtuner.Path(f"/mock/mlir/path/{i}.mlir")
candidate_trackers.append(tracker)

expected_parsed_results = [
libtuner.ParsedDisptachBenchmarkResult(