diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index ed157736007..6f46a12e34d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1307,7 +1307,8 @@ def _retry_zones( log_path = os.path.join(self.log_dir, 'provision.log') log_abs_path = os.path.abspath(log_path) if not dryrun: - os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True) + log_utils.create_and_symlink_log_dir( + os.path.expanduser(self.log_dir)) os.system(f'touch {log_path}') tail_cmd = f'tail -n100 -f {log_path}' logger.info('To view detailed progress: ' @@ -3052,7 +3053,7 @@ def _sync_workdir_node(runner: command_runner.CommandRunner) -> None: f'{style.BRIGHT}{workdir}{style.RESET_ALL}' f' -> ' f'{style.BRIGHT}{SKY_REMOTE_WORKDIR}{style.RESET_ALL}') - os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True) + log_utils.create_and_symlink_log_dir(os.path.expanduser(self.log_dir)) os.system(f'touch {log_path}') tail_cmd = f'tail -n100 -f {log_path}' logger.info('To view detailed progress: ' diff --git a/sky/callbacks/sky_callback/base.py b/sky/callbacks/sky_callback/base.py index 06fa073482c..7f4c5e862a5 100644 --- a/sky/callbacks/sky_callback/base.py +++ b/sky/callbacks/sky_callback/base.py @@ -9,6 +9,8 @@ import psutil +from sky.utils import log_utils + # NOTE: This must be the same as _SKY_REMOTE_BENCHMARK_DIR_SYMLINK # in sky/benchmark/benchmark_utils.py. _SKY_REMOTE_BENCHMARK_DIR = '~/sky_benchmark_dir' @@ -39,7 +41,7 @@ def __init__(self, log_dir, 'sky-callback-' + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')) log_dir = os.path.expanduser(log_dir) - os.makedirs(log_dir, exist_ok=True) + log_utils.create_and_symlink_log_dir(log_dir) # TODO(woosuk): Do not store the entire timestamps. self._step_begins = [] diff --git a/sky/provision/logging.py b/sky/provision/logging.py index 2ea3967d15c..4839d4352e1 100644 --- a/sky/provision/logging.py +++ b/sky/provision/logging.py @@ -9,6 +9,7 @@ import threading from sky import sky_logging +from sky.utils import log_utils @dataclasses.dataclass @@ -24,7 +25,7 @@ def setup_provision_logging(log_dir: str): try: # Redirect underlying provision logs to file. log_path = os.path.expanduser(os.path.join(log_dir, 'provision.log')) - os.makedirs(os.path.dirname(log_path), exist_ok=True) + log_utils.create_and_symlink_log_dir(os.path.dirname(log_path)) log_abs_path = pathlib.Path(log_path).expanduser().absolute() fh = logging.FileHandler(log_abs_path) fh.setFormatter(sky_logging.FORMATTER) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 84a6491605a..e13d81546d7 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -6,6 +6,7 @@ import sky SKY_LOGS_DIRECTORY = '~/sky_logs' +SKY_LATEST_LINK = 'sky_latest' SKY_REMOTE_WORKDIR = '~/sky_workdir' # Default Ray port is 6379. Default Ray dashboard port is 8265. diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index d184abd107e..aa8e3713efb 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -163,7 +163,7 @@ def run_with_log( log_path = os.path.expanduser(log_path) dirname = os.path.dirname(log_path) - os.makedirs(dirname, exist_ok=True) + log_utils.create_and_symlink_log_dir(dirname) # Redirect stderr to stdout when using ray, to preserve the order of # stdout and stderr. stdout_arg = stderr_arg = None diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index 8529874092a..73ac20be801 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -11,6 +11,7 @@ from sky.skylet import constants from sky.skylet import log_lib from sky.utils import common_utils +from sky.utils import log_utils from sky.utils import subprocess_utils from sky.utils import timeline @@ -571,7 +572,7 @@ def run( command = base_ssh_command + [shlex.quote(command_str)] log_dir = os.path.expanduser(os.path.dirname(log_path)) - os.makedirs(log_dir, exist_ok=True) + log_utils.create_and_symlink_log_dir(log_dir) executable = None if not process_stream: @@ -750,7 +751,7 @@ def run( ] log_dir = os.path.expanduser(os.path.dirname(log_path)) - os.makedirs(log_dir, exist_ok=True) + log_utils.create_and_symlink_log_dir(log_dir) executable = None if not process_stream: diff --git a/sky/utils/log_utils.py b/sky/utils/log_utils.py index 90928b8014d..70cc3c10671 100644 --- a/sky/utils/log_utils.py +++ b/sky/utils/log_utils.py @@ -1,16 +1,24 @@ """Logging utils.""" import enum +import os from typing import List, Optional import colorama +import filelock import pendulum import prettytable from sky import sky_logging +from sky.skylet import constants from sky.utils import rich_utils logger = sky_logging.init_logger(__name__) +# Filelock for updating log directory symlink. +_SYMLINK_LOCK_PATH = os.path.expanduser( + os.path.join(constants.SKY_LOGS_DIRECTORY, '.symlink_latest.lock')) +_SYMLINK_LOCK_TIMEOUT_SECONDS = 10 + class LineProcessor(object): """A processor for log lines.""" @@ -191,3 +199,51 @@ def readable_time_duration(start: Optional[float], diff = diff.replace('hour', 'hr') return diff + + +def create_and_symlink_log_dir(log_dir: str): + """Create a log directory and symlink to it from parent directory. + + If file operations fail, will log a warning and return without error. + + Args: + log_dir (str): Expanded log directory path. + """ + # Fast path for logging to /dev/null + if log_dir == os.path.dirname(os.devnull): + return + os.makedirs(log_dir, exist_ok=True) + symlink_path = os.path.join(os.path.dirname(os.path.abspath(log_dir)), + constants.SKY_LATEST_LINK) + try: + with filelock.FileLock(_SYMLINK_LOCK_PATH, + _SYMLINK_LOCK_TIMEOUT_SECONDS): + if os.path.exists(symlink_path): + if os.path.islink(symlink_path): + try: + os.remove(symlink_path) + except FileNotFoundError: + pass + except OSError: + logger.warning( + 'Failed to remove old symlink to latest logs ' + f'at {symlink_path!r}.') + return + else: + logger.warning( + f'Failed to symlink to latest logs at {symlink_path!r}.' + 'Please remove the existing file/directory.') + return + try: + os.symlink(log_dir, symlink_path) + except OSError: + logger.warning(f'Failed to symlink to latest logs from ' + f'{symlink_path!r} to {log_dir!r}.') + return + except filelock.Timeout: + logger.warning( + f'Failed to symlink to latest logs at {symlink_path!r} ' + 'due to a timeout when trying to access the log directory lock. ' + 'Please try again or manually remove the lock ' + f'at {_SYMLINK_LOCK_PATH!r}.') + return diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 325a836cf4c..ea236193556 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1317,6 +1317,26 @@ def test_scp_logs(): run_one_test(test) +def test_symlink_latest_logs(generic_cloud: str): + name = _get_cluster_name() + test = Test( + 'symlink_latest_logs', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} -- echo hi > log1.txt || true', + 'eval symlink_path="~/sky_logs/sky_latest"; [ -L "$symlink_path" ] || exit 1 ;' + 'target_file=$(readlink -f "$symlink_path") || exit 1 ;' + 'grep -E "To view detailed progress: .+ $target_file" log1.txt || exit 1;' + f'sky launch -y -c {name} --cloud {generic_cloud} -- echo hi > log2.txt || true ;' + '[ -L "$symlink_path" ] || exit 1 ;' + 'target_file2=$(readlink -f "$symlink_path") || exit 1 ;' + 'grep -E "To view detailed progress: .+ $target_file2" log2.txt || exit 1;' + '[ "$target_file" != "$target_file2" ] || exit 1' + ], + 'rm log1.txt log2.txt 2> /dev/null || true', + ) + run_one_test(test) + + # ---------- Job Queue. ---------- @pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs @pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus