From 0203971a36dbcf0a6d3615e6ed25f3f6d11b53f6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 2 Sep 2024 15:31:56 -0700 Subject: [PATCH] [Core] Add retry for ssh max length (#3884) * add retry for max length * format * fix * format * Fix and add test * format * format * format --- sky/backends/cloud_vm_ray_backend.py | 68 ++++++++++++++++++++++------ sky/exceptions.py | 4 ++ tests/test_smoke.py | 38 ++++++++++++++++ 3 files changed, 96 insertions(+), 14 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 816eefc502f..ca18f44f6da 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3112,7 +3112,8 @@ def _setup_node(node_id: int) -> None: setup_script = log_lib.make_task_bash_script(setup, env_vars=setup_envs) encoded_script = shlex.quote(setup_script) - if detach_setup or _is_command_length_over_limit(encoded_script): + + def _dump_setup_script(setup_script: str) -> None: with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f: f.write(setup_script) f.flush() @@ -3121,6 +3122,9 @@ def _setup_node(node_id: int) -> None: target=remote_setup_file_name, up=True, stream_logs=False) + + if detach_setup or _is_command_length_over_limit(encoded_script): + _dump_setup_script(setup_script) create_script_code = 'true' else: create_script_code = (f'{{ echo {encoded_script} > ' @@ -3128,20 +3132,42 @@ def _setup_node(node_id: int) -> None: if detach_setup: return + setup_log_path = os.path.join(self.log_dir, f'setup-{runner.node_id}.log') - returncode = runner.run( - f'{create_script_code} && {setup_cmd}', - log_path=setup_log_path, - process_stream=False, - # We do not source bashrc for setup, since bashrc is sourced - # in the script already. - # Skip an empty line and two lines due to the /bin/bash -i and - # source ~/.bashrc in the setup_cmd. - # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long - # bash: no job control in this shell - skip_lines=3, - ) + + def _run_setup(setup_cmd: str) -> int: + returncode = runner.run( + setup_cmd, + log_path=setup_log_path, + process_stream=False, + # We do not source bashrc for setup, since bashrc is sourced + # in the script already. + # Skip an empty line and two lines due to the /bin/bash -i + # and source ~/.bashrc in the setup_cmd. + # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long + # bash: no job control in this shell + skip_lines=3) + return returncode + + returncode = _run_setup(f'{create_script_code} && {setup_cmd}',) + if returncode == 255: + is_message_too_long = False + with open(setup_log_path, 'r', encoding='utf-8') as f: + if 'too long' in f.read(): + is_message_too_long = True + + if is_message_too_long: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. We use a + # general length limit check before but it could be + # inaccurate on some systems. + logger.debug( + 'Failed to run setup command inline due to ' + 'command length limit. Dumping setup script to ' + 'file and running it with SSH.') + _dump_setup_script(setup_script) + returncode = _run_setup(setup_cmd) def error_message() -> str: # Use the function to avoid tailing the file in success case @@ -3223,7 +3249,8 @@ def _exec_code_on_head( code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd) job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code]) - if _is_command_length_over_limit(job_submit_cmd): + + def _dump_code_to_file(codegen: str) -> None: runners = handle.get_command_runners() head_runner = runners[0] with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp: @@ -3238,6 +3265,9 @@ def _exec_code_on_head( target=script_path, up=True, stream_logs=False) + + if _is_command_length_over_limit(job_submit_cmd): + _dump_code_to_file(codegen) job_submit_cmd = f'{mkdir_code} && {code}' if managed_job_dag is not None: @@ -3263,6 +3293,16 @@ def _exec_code_on_head( job_submit_cmd, stream_logs=False, require_outputs=True) + if returncode == 255 and 'too long' in stdout + stderr: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. We use a general + # length limit check before but it could be inaccurate on some + # systems. + _dump_code_to_file(codegen) + returncode, stdout, stderr = self.run_on_head(handle, + job_submit_cmd, + stream_logs=False, + require_outputs=True) # Happens when someone calls `sky exec` but remote is outdated # necessitating calling `sky launch`. diff --git a/sky/exceptions.py b/sky/exceptions.py index 99784a8c96d..15f3ea3f34e 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -100,9 +100,13 @@ def __init__(self, returncode: int, command: str, error_msg: str, self.command = command self.error_msg = error_msg self.detailed_reason = detailed_reason + if not command: message = error_msg else: + if len(command) > 100: + # Chunck the command to avoid overflow. + command = command[:100] + '...' message = (f'Command {command} failed with return code ' f'{returncode}.\n{error_msg}') super().__init__(message) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 7b5ce389b8c..63ccd19857d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -34,6 +34,7 @@ import subprocess import sys import tempfile +import textwrap import time from typing import Dict, List, NamedTuple, Optional, Tuple import urllib.parse @@ -3436,6 +3437,43 @@ def test_gcp_zero_quota_failover(): run_one_test(test) +def test_long_setup_run_script(generic_cloud: str): + name = _get_cluster_name() + with tempfile.NamedTemporaryFile('w', prefix='sky_app_', + suffix='.yaml') as f: + f.write( + textwrap.dedent(""" \ + setup: | + echo "start long setup" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end long setup"\n') + f.write( + textwrap.dedent(""" \ + run: | + echo "run" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end run"\n') + f.flush() + + test = Test( + 'long-setup-run-script', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup --detach-run --cpus 2+ {f.name}', + f'sky exec --detach-run {name} "echo hello"', + f'sky exec --detach-run {name} {f.name}', + f'sky logs {name} --status 1', + f'sky logs {name} --status 2', + f'sky logs {name} --status 3', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + # ---------- Testing skyserve ----------