From 1dfcdb2aa75848a442fcbc9ee3e8e503e5036f4b Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Thu, 5 Sep 2024 16:27:15 -0500 Subject: [PATCH] Fix timeouts. Update sample commands. --- tuner/examples/punet/README.md | 6 +++--- tuner/examples/punet/punet_autotune.py | 21 +++++++-------------- tuner/libtuner.py | 24 +++++++++++++++--------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/tuner/examples/punet/README.md b/tuner/examples/punet/README.md index ed5490121..f478c8695 100644 --- a/tuner/examples/punet/README.md +++ b/tuner/examples/punet/README.md @@ -31,16 +31,16 @@ cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchma ### Recommended Trial Run For an initial trial to test the tuning loop, use: ```shell -python punet_autotune.py test-benchmark.mlir --num-candidates=10 +python -m tuner.examples.punet.punet_autotune test-benchmark.mlir --num-candidates=10 ``` ### Dry Run Test To perform a dry run (no GPU required), use: ```shell -python punet_autotune.py test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run +python -m tuner.examples.punet.punet_autotune test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run ``` ### Basic Usage ```shell -python punet_autotune.py test-benchmark.mlir +python -m tuner.examples.punet.punet_autotune test-benchmark.mlir ``` diff --git a/tuner/examples/punet/punet_autotune.py b/tuner/examples/punet/punet_autotune.py index 49cd3530c..9fa20cb52 100644 --- a/tuner/examples/punet/punet_autotune.py +++ b/tuner/examples/punet/punet_autotune.py @@ -7,17 +7,17 @@ """ Sample Usage: -python punet_autotune.py 2.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64 +python -m tuner.examples.punet.punet_autotune benchmark.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64 Recommended Trial Run: -python punet_autotune.py 2.mlir --num-candidates=1 +python -m 
tuner.examples.punet.punet_autotune benchmark.mlir --num-candidates=1 Dry Run Test (no gpu requried): -python punet_autotune.py 2.mlir --num-candidates=64 --num-model-candidates=10 --dry-run +python -m tuner.examples.punet.punet_autotune benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run """ @@ -35,7 +35,7 @@ def get_dispatch_compile_command( mlir_path = candidate_tracker.dispatch_mlir_path assert mlir_path is not None command = [ - "./compile_candidate.sh", + "compile_candidate.sh", mlir_path.as_posix(), ] return command @@ -51,9 +51,7 @@ def get_dispatch_benchmark_command( assert compiled_vmfb_path is not None command = [ - "timeout", - "16s", - "./iree-benchmark-module", + "iree-benchmark-module", f"--device={libtuner.DEVICE_ID_PLACEHOLDER}", f"--module={compiled_vmfb_path.resolve()}", "--hip_use_streams=true", @@ -74,14 +72,11 @@ def get_model_compile_command( ) -> list[str]: mlir_spec_path = candidate_tracker.spec_path assert mlir_spec_path is not None - script_dir = Path(__file__).resolve().parent target_dir = mlir_spec_path.resolve().parent.parent.parent output_name = f"unet_candidate_{candidate_tracker.candidate_id}.vmfb" command = [ - "timeout", - "300s", - "./compile-punet-base.sh", - "./iree-compile", + "compile-punet-base.sh", + "iree-compile", "gfx942", f"{mlir_spec_path.resolve()}", "./punet.mlir", @@ -100,8 +95,6 @@ def get_model_benchmark_command( assert unet_candidate_path is not None command = [ - "timeout", - "180s", "iree-benchmark-module", f"--device={libtuner.DEVICE_ID_PLACEHOLDER}", "--hip_use_streams=true", diff --git a/tuner/libtuner.py b/tuner/libtuner.py index 300bd5a2a..396b535f1 100644 --- a/tuner/libtuner.py +++ b/tuner/libtuner.py @@ -200,7 +200,7 @@ def get_model_benchmark_timeout_s(self) -> int: class RunPack: command: list[str] check: bool = True - timeout: Optional[int] = None + timeout_seconds: Optional[int] = None @dataclass @@ -523,7 +523,7 @@ def create_worker_context_queue(device_ids: list[int]) -> 
queue.Queue[tuple[int, def run_command(run_pack: RunPack) -> TaskResult: command = run_pack.command check = run_pack.check - timeout = run_pack.timeout + timeout_seconds = run_pack.timeout_seconds result = None is_timeout = False @@ -534,7 +534,11 @@ def run_command(run_pack: RunPack) -> TaskResult: # Add timeout to subprocess.run call result = subprocess.run( - command, check=check, capture_output=True, text=True, timeout=timeout + command, + check=check, + capture_output=True, + text=True, + timeout=timeout_seconds, ) if result.stdout: @@ -542,7 +546,9 @@ def run_command(run_pack: RunPack) -> TaskResult: if result.stderr: logging.debug(f"stderr: {result.stderr}") except subprocess.TimeoutExpired as e: - logging.warning(f"Command '{command_str}' timed out after {timeout} seconds.") + logging.warning( + f"Command '{command_str}' timed out after {timeout_seconds} seconds." + ) is_timeout = True except subprocess.CalledProcessError as e: print(e.output) @@ -811,7 +817,7 @@ def compile_dispatches( candidate_trackers[i] ), check=False, - timeout=tuning_client.get_dispatch_compile_timeout_s(), + timeout_seconds=tuning_client.get_dispatch_compile_timeout_s(), ), candidate_id=i, ) @@ -991,7 +997,7 @@ def benchmark_dispatches( candidate_trackers[i] ), check=False, - timeout=tuning_client.get_dispatch_benchmark_timeout_s(), + timeout_seconds=tuning_client.get_dispatch_benchmark_timeout_s(), ), candidate_id=i, command_need_device_id=True, @@ -1071,7 +1077,7 @@ def compile_models( RunPack( command=tuning_client.get_model_compile_command(candidate_trackers[i]), check=False, - timeout=tuning_client.get_model_compile_timeout_s(), + timeout_seconds=tuning_client.get_model_compile_timeout_s(), ), candidate_id=i, ) @@ -1272,7 +1278,7 @@ def benchmark_models( candidate_trackers[i] ), check=False, - timeout=tuning_client.get_dispatch_benchmark_timeout_s(), + timeout_seconds=tuning_client.get_dispatch_benchmark_timeout_s(), ), candidate_id=i, command_need_device_id=True, @@ -1298,7 +1304,7 @@ 
def benchmark_models( candidate_trackers[0] ), check=False, - timeout=tuning_client.get_model_benchmark_timeout_s(), + timeout_seconds=tuning_client.get_model_benchmark_timeout_s(), ), candidate_id=0, command_need_device_id=True,