Commit: timeout
epwalsh committed Jul 20, 2023
1 parent 6eda824 · commit b4fdc6f

Showing 2 changed files with 14 additions and 5 deletions.
olmo/train.py (7 changes: 3 additions & 4 deletions)

@@ -39,6 +39,7 @@
     move_to_device,
     peak_gpu_memory,
     syncronize_flag,
+    wait_on,
 )

 __all__ = ["SpeedMonitor", "LRMonitor", "Trainer"]
@@ -254,8 +255,7 @@ def save_sharded_checkpoint(self) -> Path:
         # replacing the temp directory with the final directory from rank 0 might not be immediately
         # realized in the file systems of the other ranks.
         # So we wait here across all ranks until that final checkpoint directory is visible.
-        while not checkpoint_dir.exists():
-            time.sleep(0.5)
+        wait_on(lambda: checkpoint_dir.exists(), "Waiting for checkpoint directory", timeout=10.0)

         # Remove old checkpoints.
         if self.cfg.save_num_checkpoints_to_keep > 0:
@@ -401,8 +401,7 @@ def save_unsharded_checkpoint(self) -> Path:
         # replacing the temp directory with the final directory from rank 0 might not be immediately
         # realized in the file systems of the other ranks.
         # So we wait here across all ranks until that final checkpoint directory is visible.
-        while not checkpoint_dir.exists():
-            time.sleep(0.5)
+        wait_on(lambda: checkpoint_dir.exists(), "Waiting for checkpoint directory", timeout=10.0)

         # Remove old checkpoints.
         if self.cfg.save_num_unsharded_checkpoints_to_keep > 0:
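The comment in both hunks explains why the trainer polls at all: rank 0 replaces the temp checkpoint directory with the final one, and on a shared filesystem the other ranks may not see the rename right away. A minimal sketch of that pattern, outside this diff (the finalize_checkpoint helper, the barrier placement, and the directory names are illustrative assumptions, not code from this commit):

import pathlib

import torch.distributed as dist

from olmo.util import wait_on


def finalize_checkpoint(tmp_dir: pathlib.Path, checkpoint_dir: pathlib.Path) -> None:
    # Rank 0 publishes the checkpoint by atomically renaming the temp directory.
    if dist.get_rank() == 0:
        tmp_dir.replace(checkpoint_dir)
    dist.barrier()
    # On a shared network filesystem the rename may not be visible to every rank yet,
    # so each rank polls for it, but now gives up after 10 seconds instead of looping forever.
    wait_on(lambda: checkpoint_dir.exists(), "Waiting for checkpoint directory", timeout=10.0)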
olmo/util.py (12 changes: 11 additions & 1 deletion)

@@ -2,9 +2,10 @@
 import os
 import socket
 import sys
+import time
 import warnings
 from datetime import datetime
-from typing import Any, Dict, Optional, TypeVar, Union
+from typing import Any, Callable, Dict, Optional, TypeVar, Union

 import rich
 import torch
@@ -339,3 +340,12 @@ def syncronize_flag(flag: bool, device: torch.device) -> bool:
         return flag_tensor.item()  # type: ignore
     else:
         return flag
+
+
+def wait_on(condition: Callable[[], bool], description: str, timeout: float = 10.0):
+    """Wait on the condition function to return True."""
+    start_time = time.monotonic()
+    while not condition():
+        time.sleep(0.5)
+        if time.monotonic() - start_time > timeout:
+            raise TimeoutError(f"{description} timed out")
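As a standalone illustration of the new helper's failure mode (a hypothetical script, not part of this commit): wait_on polls every 0.5 s and raises TimeoutError with the given description once the timeout elapses.

from pathlib import Path

from olmo.util import wait_on

marker = Path("/tmp/run-finished.marker")  # hypothetical path another process is expected to create
try:
    wait_on(lambda: marker.exists(), "Waiting for run-finished marker", timeout=5.0)
except TimeoutError as err:
    # Raised as "Waiting for run-finished marker timed out" if the marker never appears.
    print(f"Gave up: {err}")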
