
Commit

Fix timeouts. Update sample commands.
kuhar committed Sep 5, 2024
1 parent a6f3c5a commit 7adce63
Showing 3 changed files with 25 additions and 26 deletions.
tuner/examples/punet/README.md (6 changes: 3 additions & 3 deletions)
@@ -31,16 +31,16 @@ cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchmark.mlir
### Recommended Trial Run
For an initial trial to test the tuning loop, use:
```shell
-python punet_autotune.py test-benchmark.mlir --num-candidates=10
+python -m tuner.examples.punet.punet_autotune test-benchmark.mlir --num-candidates=10
```

### Dry Run Test
To perform a dry run (no GPU required), use:
```shell
-python punet_autotune.py test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
+python -m tuner.examples.punet.punet_autotune test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
```

### Basic Usage
```shell
-python punet_autotune.py test-benchmark.mlir
+python -m tuner.examples.punet.punet_autotune test-benchmark.mlir
```
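
The updated commands invoke the tuner as a package module, which presumably requires running from the repository root so that the `tuner` package is importable. A minimal programmatic equivalent, sketched under that assumption (the argument values are illustrative, and it assumes the module guards its entry point with `if __name__ == "__main__"`):

```python
# Rough equivalent of `python -m tuner.examples.punet.punet_autotune ...`,
# assuming the current working directory is the repository root so the
# `tuner` package resolves. Argument values are illustrative only.
import runpy
import sys

sys.argv = ["punet_autotune", "test-benchmark.mlir", "--num-candidates=10"]
runpy.run_module("tuner.examples.punet.punet_autotune", run_name="__main__")
```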
tuner/examples/punet/punet_autotune.py (21 changes: 7 additions & 14 deletions)
@@ -7,17 +7,17 @@
"""
Sample Usage:
-python punet_autotune.py 2.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64
+python -m tuner.examples.punet.punet_autotune benchmark.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64
Recommended Trial Run:
-python punet_autotune.py 2.mlir --num-candidates=1
+python -m tuner.examples.punet.punet_autotune benchmark.mlir --num-candidates=1
Dry Run Test (no GPU required):
-python punet_autotune.py 2.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
+python -m tuner.examples.punet.punet_autotune benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
"""

@@ -35,7 +35,7 @@ def get_dispatch_compile_command(
mlir_path = candidate_tracker.dispatch_mlir_path
assert mlir_path is not None
command = [
"./compile_candidate.sh",
"compile_candidate.sh",
mlir_path.as_posix(),
]
return command
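
With the leading `./` dropped, the helper script is no longer addressed relative to the current directory; a plausible reading (an assumption, not stated in the commit) is that it is resolved like any other bare command name, e.g. via PATH or by the caller. A quick way to check that resolution:

```python
# Hedged sketch: confirm that a bare "compile_candidate.sh" resolves the way
# a subprocess lookup would resolve it. PATH-based lookup is an assumption
# about the intended setup, not something the diff states.
import shutil

script = shutil.which("compile_candidate.sh")
if script is None:
    raise FileNotFoundError("compile_candidate.sh not found on PATH")
print(f"resolved to {script}")
```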
@@ -51,9 +51,7 @@ def get_dispatch_benchmark_command(
assert compiled_vmfb_path is not None

command = [
"timeout",
"16s",
"./iree-benchmark-module",
"iree-benchmark-module",
f"--device={libtuner.DEVICE_ID_PLACEHOLDER}",
f"--module={compiled_vmfb_path.resolve()}",
"--hip_use_streams=true",
@@ -74,14 +72,11 @@ def get_model_compile_command(
) -> list[str]:
mlir_spec_path = candidate_tracker.spec_path
assert mlir_spec_path is not None
-script_dir = Path(__file__).resolve().parent
target_dir = mlir_spec_path.resolve().parent.parent.parent
output_name = f"unet_candidate_{candidate_tracker.candidate_id}.vmfb"
command = [
"timeout",
"300s",
"./compile-punet-base.sh",
"./iree-compile",
"compile-punet-base.sh",
"iree-compile",
"gfx942",
f"{mlir_spec_path.resolve()}",
"./punet.mlir",
@@ -100,8 +95,6 @@ def get_model_benchmark_command(
assert unet_candidate_path is not None

command = [
"timeout",
"180s",
"iree-benchmark-module",
f"--device={libtuner.DEVICE_ID_PLACEHOLDER}",
"--hip_use_streams=true",
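
Taken together, the punet_autotune.py changes drop the external `timeout <N>s` wrapper from every generated command; the time budget now travels separately and is enforced in `tuner/libtuner.py` (see below). A hedged before/after sketch, with the paths and the 16-second budget taken from the removed lines and the variable names purely illustrative:

```python
# Before: the timeout was baked into the command itself.
old_command = ["timeout", "16s", "./iree-benchmark-module", "--device=hip://0"]

# After: the command is just the benchmark invocation; the budget is carried
# alongside it (assumption: via RunPack.timeout_seconds, per the libtuner diff).
new_command = ["iree-benchmark-module", "--device=hip://0"]
dispatch_benchmark_timeout_s = 16
```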
tuner/libtuner.py (24 changes: 15 additions & 9 deletions)
@@ -200,7 +200,7 @@ def get_model_benchmark_timeout_s(self) -> int:
class RunPack:
command: list[str]
check: bool = True
-timeout: Optional[int] = None
+timeout_seconds: Optional[int] = None


@dataclass
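
The `timeout` field on `RunPack` becomes `timeout_seconds`, making the unit explicit at every call site. A self-contained usage sketch (the dataclass mirrors the fields shown in the diff; the concrete command and the 16-second value are illustrative assumptions):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class RunPack:
    command: list[str]
    check: bool = True
    timeout_seconds: Optional[int] = None


# Illustrative values; in the tuner these presumably come from the
# TuningClient hooks (e.g. get_dispatch_benchmark_timeout_s()).
pack = RunPack(
    command=["iree-benchmark-module", "--device=hip://0"],
    check=False,
    timeout_seconds=16,
)
```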
@@ -523,7 +523,7 @@ def create_worker_context_queue(device_ids: list[int]) -> queue.Queue[tuple[int,
def run_command(run_pack: RunPack) -> TaskResult:
command = run_pack.command
check = run_pack.check
-timeout = run_pack.timeout
+timeout_seconds = run_pack.timeout_seconds

result = None
is_timeout = False
@@ -534,15 +534,21 @@ def run_command(run_pack: RunPack) -> TaskResult:

# Add timeout to subprocess.run call
result = subprocess.run(
-command, check=check, capture_output=True, text=True, timeout=timeout
+command,
+check=check,
+capture_output=True,
+text=True,
+timeout=timeout_seconds,
)

if result.stdout:
logging.debug(f"stdout: {result.stdout}")
if result.stderr:
logging.debug(f"stderr: {result.stderr}")
except subprocess.TimeoutExpired as e:
-logging.warning(f"Command '{command_str}' timed out after {timeout} seconds.")
+logging.warning(
+f"Command '{command_str}' timed out after {timeout_seconds} seconds."
+)
is_timeout = True
except subprocess.CalledProcessError as e:
print(e.output)
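
The warning is reflowed and the new `timeout_seconds` name is used throughout; the timeout itself is enforced by `subprocess.run`, which kills the child process and raises `TimeoutExpired` once the budget elapses. A minimal standalone illustration of that behaviour (the 2-second sleep against a 1-second budget is a hypothetical example, and it assumes a POSIX `sleep` binary):

```python
import subprocess

try:
    subprocess.run(
        ["sleep", "2"],
        check=False,
        capture_output=True,
        text=True,
        timeout=1,
    )
except subprocess.TimeoutExpired as e:
    # e.cmd and e.timeout describe what was run and the budget that expired.
    print(f"'{' '.join(e.cmd)}' timed out after {e.timeout} seconds")
```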
@@ -811,7 +817,7 @@ def compile_dispatches(
candidate_trackers[i]
),
check=False,
-timeout=tuning_client.get_dispatch_compile_timeout_s(),
+timeout_seconds=tuning_client.get_dispatch_compile_timeout_s(),
),
candidate_id=i,
)
@@ -991,7 +997,7 @@ def benchmark_dispatches(
candidate_trackers[i]
),
check=False,
-timeout=tuning_client.get_dispatch_benchmark_timeout_s(),
+timeout_seconds=tuning_client.get_dispatch_benchmark_timeout_s(),
),
candidate_id=i,
command_need_device_id=True,
@@ -1071,7 +1077,7 @@ def compile_models(
RunPack(
command=tuning_client.get_model_compile_command(candidate_trackers[i]),
check=False,
-timeout=tuning_client.get_model_compile_timeout_s(),
+timeout_seconds=tuning_client.get_model_compile_timeout_s(),
),
candidate_id=i,
)
@@ -1272,7 +1278,7 @@ def benchmark_models(
candidate_trackers[i]
),
check=False,
-timeout=tuning_client.get_dispatch_benchmark_timeout_s(),
+timeout_seconds=tuning_client.get_dispatch_benchmark_timeout_s(),
),
candidate_id=i,
command_need_device_id=True,
@@ -1298,7 +1304,7 @@
candidate_trackers[0]
),
check=False,
-timeout=tuning_client.get_model_benchmark_timeout_s(),
+timeout_seconds=tuning_client.get_model_benchmark_timeout_s(),
),
candidate_id=0,
command_need_device_id=True,
