From 7bd76f98a2f0f8337ddca946c595fb96a2b760d3 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 28 Aug 2024 07:25:59 +0000 Subject: [PATCH 1/8] add retry for max length --- sky/backends/cloud_vm_ray_backend.py | 63 ++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 9545436f05c..76738d726ad 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3112,7 +3112,8 @@ def _setup_node(node_id: int) -> None: setup_script = log_lib.make_task_bash_script(setup, env_vars=setup_envs) encoded_script = shlex.quote(setup_script) - if detach_setup or _is_command_length_over_limit(encoded_script): + + def _dump_setup_script(setup_script: str, dump_script: bool) -> None: with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f: f.write(setup_script) f.flush() @@ -3121,6 +3122,9 @@ def _setup_node(node_id: int) -> None: target=remote_setup_file_name, up=True, stream_logs=False) + + if detach_setup or _is_command_length_over_limit(encoded_script): + _dump_setup_script(setup_script) create_script_code = 'true' else: create_script_code = (f'{{ echo {encoded_script} > ' @@ -3128,20 +3132,39 @@ def _setup_node(node_id: int) -> None: if detach_setup: return + setup_log_path = os.path.join(self.log_dir, - f'setup-{runner.node_id}.log') - returncode = runner.run( + f'setup-{runner.node_id}.log') + def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: + returncode, stdout, stderr = runner.run( + setup_cmd, + log_path=setup_log_path, + process_stream=False, + # We do not source bashrc for setup, since bashrc is sourced + # in the script already. + # Skip an empty line and two lines due to the /bin/bash -i and + # source ~/.bashrc in the setup_cmd. + # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long + # bash: no job control in this shell + skip_lines=3, + require_outputs=True, + ) + return returncode, stdout, stderr + + returncode, stdout, stderr = _run_setup( f'{create_script_code} && {setup_cmd}', - log_path=setup_log_path, - process_stream=False, - # We do not source bashrc for setup, since bashrc is sourced - # in the script already. - # Skip an empty line and two lines due to the /bin/bash -i and - # source ~/.bashrc in the setup_cmd. - # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long - # bash: no job control in this shell - skip_lines=3, ) + if returncode == 255 and 'too long' in stdout + stderr: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. We use a general + # length limit check before but it could be inaccurate on some + # systems. + logger.debug(f'Failed to run setup command inline due to ' + 'command length limit. ' + 'Dumping setup script to file and running it ' + f'with SSH.') + _dump_setup_script(setup_script, dump_script=True) + returncode, stdout, stderr = _run_setup(setup_cmd) def error_message() -> str: # Use the function to avoid tailing the file in success case @@ -3223,7 +3246,8 @@ def _exec_code_on_head( code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd) job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code]) - if _is_command_length_over_limit(job_submit_cmd): + + def _dump_code_to_file(codegen: str) -> None: runners = handle.get_command_runners() head_runner = runners[0] with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp: @@ -3238,6 +3262,9 @@ def _exec_code_on_head( target=script_path, up=True, stream_logs=False) + + if _is_command_length_over_limit(job_submit_cmd): + _dump_code_to_file(codegen) job_submit_cmd = f'{mkdir_code} && {code}' if managed_job_dag is not None: @@ -3263,6 +3290,16 @@ def _exec_code_on_head( job_submit_cmd, stream_logs=False, require_outputs=True) + if returncode == 255 and 'too long' in stdout + stderr: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. We use a general + # length limit check before but it could be inaccurate on some + # systems. + _dump_code_to_file(codegen) + returncode, stdout, stderr = self.run_on_head(handle, + job_submit_cmd, + stream_logs=False, + require_outputs=True) # Happens when someone calls `sky exec` but remote is outdated # necessitating calling `sky launch`. From 2db39f1bf02c94baee541fc5bc2a744a3296b5d8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 28 Aug 2024 18:46:56 +0000 Subject: [PATCH 2/8] format --- sky/backends/cloud_vm_ray_backend.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 76738d726ad..c2f697188ef 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3113,7 +3113,7 @@ def _setup_node(node_id: int) -> None: env_vars=setup_envs) encoded_script = shlex.quote(setup_script) - def _dump_setup_script(setup_script: str, dump_script: bool) -> None: + def _dump_setup_script(setup_script: str) -> None: with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f: f.write(setup_script) f.flush() @@ -3134,7 +3134,8 @@ def _dump_setup_script(setup_script: str, dump_script: bool) -> None: return setup_log_path = os.path.join(self.log_dir, - f'setup-{runner.node_id}.log') + f'setup-{runner.node_id}.log') + def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: returncode, stdout, stderr = runner.run( setup_cmd, @@ -3152,8 +3153,7 @@ def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: return returncode, stdout, stderr returncode, stdout, stderr = _run_setup( - f'{create_script_code} && {setup_cmd}', - ) + f'{create_script_code} && {setup_cmd}',) if returncode == 255 and 'too long' in stdout + stderr: # If the setup script is too long, we retry it with dumping # the script to a file and running it with SSH. We use a general @@ -3262,7 +3262,7 @@ def _dump_code_to_file(codegen: str) -> None: target=script_path, up=True, stream_logs=False) - + if _is_command_length_over_limit(job_submit_cmd): _dump_code_to_file(codegen) job_submit_cmd = f'{mkdir_code} && {code}' From d7afa7e619927319b914e23985e34f0971e20ed7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 28 Aug 2024 18:48:01 +0000 Subject: [PATCH 3/8] fix --- sky/backends/cloud_vm_ray_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index c2f697188ef..5068b8e9c7a 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3163,7 +3163,7 @@ def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: 'command length limit. ' 'Dumping setup script to file and running it ' f'with SSH.') - _dump_setup_script(setup_script, dump_script=True) + _dump_setup_script(setup_script) returncode, stdout, stderr = _run_setup(setup_cmd) def error_message() -> str: From 1e44a1dbaf3222bf6038cbdc63499314562ba28a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 28 Aug 2024 18:50:23 +0000 Subject: [PATCH 4/8] format --- sky/backends/cloud_vm_ray_backend.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 5068b8e9c7a..513caad8e93 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3143,8 +3143,8 @@ def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: process_stream=False, # We do not source bashrc for setup, since bashrc is sourced # in the script already. - # Skip an empty line and two lines due to the /bin/bash -i and - # source ~/.bashrc in the setup_cmd. + # Skip an empty line and two lines due to the /bin/bash -i + # and source ~/.bashrc in the setup_cmd. # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long # bash: no job control in this shell skip_lines=3, @@ -3159,10 +3159,9 @@ def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: # the script to a file and running it with SSH. We use a general # length limit check before but it could be inaccurate on some # systems. - logger.debug(f'Failed to run setup command inline due to ' - 'command length limit. ' - 'Dumping setup script to file and running it ' - f'with SSH.') + logger.debug('Failed to run setup command inline due to ' + 'command length limit. Dumping setup script to ' + 'file and running it with SSH.') _dump_setup_script(setup_script) returncode, stdout, stderr = _run_setup(setup_cmd) From 59f6d3eff4ab12f9c9dbda1d6713d38a343cbbdf Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 2 Sep 2024 20:17:26 +0000 Subject: [PATCH 5/8] Fix and add test --- sky/backends/cloud_vm_ray_backend.py | 37 ++++++++++++++++------------ sky/exceptions.py | 4 +++ tests/test_smoke.py | 34 +++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 513caad8e93..65c4fbafc01 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3136,8 +3136,8 @@ def _dump_setup_script(setup_script: str) -> None: setup_log_path = os.path.join(self.log_dir, f'setup-{runner.node_id}.log') - def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: - returncode, stdout, stderr = runner.run( + def _run_setup(setup_cmd: str) -> int: + returncode = runner.run( setup_cmd, log_path=setup_log_path, process_stream=False, @@ -3147,23 +3147,28 @@ def _run_setup(setup_cmd: str) -> Tuple[int, str, str]: # and source ~/.bashrc in the setup_cmd. # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long # bash: no job control in this shell - skip_lines=3, - require_outputs=True, + skip_lines=3 ) - return returncode, stdout, stderr + return returncode - returncode, stdout, stderr = _run_setup( + returncode = _run_setup( f'{create_script_code} && {setup_cmd}',) - if returncode == 255 and 'too long' in stdout + stderr: - # If the setup script is too long, we retry it with dumping - # the script to a file and running it with SSH. We use a general - # length limit check before but it could be inaccurate on some - # systems. - logger.debug('Failed to run setup command inline due to ' - 'command length limit. Dumping setup script to ' - 'file and running it with SSH.') - _dump_setup_script(setup_script) - returncode, stdout, stderr = _run_setup(setup_cmd) + if returncode == 255: + is_message_too_long = False + with open(setup_log_path, 'r') as f: + if 'too long' in f.read(): + is_message_too_long = True + + if is_message_too_long: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. We use a + # general length limit check before but it could be + # inaccurate on some systems. + logger.debug('Failed to run setup command inline due to ' + 'command length limit. Dumping setup script to ' + 'file and running it with SSH.') + _dump_setup_script(setup_script) + returncode = _run_setup(setup_cmd) def error_message() -> str: # Use the function to avoid tailing the file in success case diff --git a/sky/exceptions.py b/sky/exceptions.py index 99784a8c96d..15f3ea3f34e 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -100,9 +100,13 @@ def __init__(self, returncode: int, command: str, error_msg: str, self.command = command self.error_msg = error_msg self.detailed_reason = detailed_reason + if not command: message = error_msg else: + if len(command) > 100: + # Chunck the command to avoid overflow. + command = command[:100] + '...' message = (f'Command {command} failed with return code ' f'{returncode}.\n{error_msg}') super().__init__(message) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f338de2dda7..db0c05644cc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -34,6 +34,7 @@ import subprocess import sys import tempfile +import textwrap import time from typing import Dict, List, NamedTuple, Optional, Tuple import urllib.parse @@ -3436,6 +3437,39 @@ def test_gcp_zero_quota_failover(): run_one_test(test) +def test_long_setup_run_script(generic_cloud: str): + name = _get_cluster_name() + with tempfile.NamedTemporaryFile('w', prefix='sky_app_', suffix='.yaml') as f: + f.write(textwrap.dedent(""" \ + setup: | + echo "start long setup" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end long setup"\n') + f.write(textwrap.dedent(""" \ + run: | + echo "run" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end run"\n') + f.flush() + + test = Test( + 'long-setup-run-script', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup --detach-run --cpus 2+ {f.name}', + f'sky exec --detach-run {name} "echo hello"', + f'sky exec --detach-run {name} {f.name}', + f'sky logs {name} --status 1', + f'sky logs {name} --status 2', + f'sky logs {name} --status 3', + ], + f'sky down -y {name}', + ) + run_one_test(test) + # ---------- Testing skyserve ---------- From fc2c391f9e6a68a111eb421db9e5b1729c18643f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 2 Sep 2024 20:51:53 +0000 Subject: [PATCH 6/8] format --- sky/backends/cloud_vm_ray_backend.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 65c4fbafc01..dc623a685a6 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3147,12 +3147,10 @@ def _run_setup(setup_cmd: str) -> int: # and source ~/.bashrc in the setup_cmd. # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long # bash: no job control in this shell - skip_lines=3 - ) + skip_lines=3) return returncode - returncode = _run_setup( - f'{create_script_code} && {setup_cmd}',) + returncode = _run_setup(f'{create_script_code} && {setup_cmd}',) if returncode == 255: is_message_too_long = False with open(setup_log_path, 'r') as f: @@ -3164,9 +3162,10 @@ def _run_setup(setup_cmd: str) -> int: # the script to a file and running it with SSH. We use a # general length limit check before but it could be # inaccurate on some systems. - logger.debug('Failed to run setup command inline due to ' - 'command length limit. Dumping setup script to ' - 'file and running it with SSH.') + logger.debug( + 'Failed to run setup command inline due to ' + 'command length limit. Dumping setup script to ' + 'file and running it with SSH.') _dump_setup_script(setup_script) returncode = _run_setup(setup_cmd) From 88d67eb31db280a494f21e65416538ebec5839d7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 2 Sep 2024 20:52:00 +0000 Subject: [PATCH 7/8] format --- tests/test_smoke.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index db0c05644cc..2300819b04a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3439,15 +3439,18 @@ def test_gcp_zero_quota_failover(): def test_long_setup_run_script(generic_cloud: str): name = _get_cluster_name() - with tempfile.NamedTemporaryFile('w', prefix='sky_app_', suffix='.yaml') as f: - f.write(textwrap.dedent(""" \ + with tempfile.NamedTemporaryFile('w', prefix='sky_app_', + suffix='.yaml') as f: + f.write( + textwrap.dedent(""" \ setup: | echo "start long setup" """)) for i in range(1024 * 120): f.write(f' echo {i}\n') f.write(' echo "end long setup"\n') - f.write(textwrap.dedent(""" \ + f.write( + textwrap.dedent(""" \ run: | echo "run" """)) @@ -3470,6 +3473,7 @@ def test_long_setup_run_script(generic_cloud: str): ) run_one_test(test) + # ---------- Testing skyserve ---------- From 640be14b30fb1aab794d6dd4c82efbcfb0eaa411 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 2 Sep 2024 21:48:53 +0000 Subject: [PATCH 8/8] format --- sky/backends/cloud_vm_ray_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index dc623a685a6..2b2474de717 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3153,7 +3153,7 @@ def _run_setup(setup_cmd: str) -> int: returncode = _run_setup(f'{create_script_code} && {setup_cmd}',) if returncode == 255: is_message_too_long = False - with open(setup_log_path, 'r') as f: + with open(setup_log_path, 'r', encoding='utf-8') as f: if 'too long' in f.read(): is_message_too_long = True