diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml index 8970737483d..4832fa72c04 100644 --- a/llm/axolotl/axolotl-spot.yaml +++ b/llm/axolotl/axolotl-spot.yaml @@ -30,3 +30,4 @@ envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass. +4 diff --git a/llm/axolotl/axolotl.yaml b/llm/axolotl/axolotl.yaml index f46588e9aae..b43127c9361 100644 --- a/llm/axolotl/axolotl.yaml +++ b/llm/axolotl/axolotl.yaml @@ -16,9 +16,3 @@ run: | envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. - - - - - - diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 6ff9731033e..6933d3dfa0d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3177,6 +3177,11 @@ def _setup_node(node_id: int) -> None: process_stream=False, # We do not source bashrc for setup, since bashrc is sourced # in the script already. + # Skip two lines due to the /bin/bash -i and source ~/.bashrc + # in the setup_cmd. + # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long + # bash: no job control in this shell + skip_lines=2, ) def error_message() -> str: diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index 3ed69d0e2a1..f43296c2f1e 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -66,7 +66,7 @@ def ssh_options_list( arg_dict = { # SSH port 'Port': port, - # Supresses initial fingerprint verification. + # Suppresses initial fingerprint verification. 'StrictHostKeyChecking': 'no', # SSH IP and fingerprint pairs no longer added to known_hosts. # This is to remove a 'REMOTE HOST IDENTIFICATION HAS CHANGED' @@ -74,6 +74,10 @@ def ssh_options_list( # deleted node, because the fingerprints will not match in # that case. 
'UserKnownHostsFile': os.devnull, + # Suppresses the warning messages, such as: + # Warning: Permanently added '34.69.216.203' (ED25519) to the list of + # known hosts. + 'LogLevel': 'ERROR', # Try fewer extraneous key pairs. 'IdentitiesOnly': 'yes', # Abort if port forwarding fails (instead of just printing to @@ -216,7 +220,9 @@ def run( stream_logs: bool = True, ssh_mode: SshMode = SshMode.NON_INTERACTIVE, separate_stderr: bool = False, + connect_timeout: Optional[int] = None, source_bashrc: bool = False, + skip_lines: int = 0, **kwargs) -> Union[int, Tuple[int, str, str]]: """Runs the command on the cluster. @@ -228,6 +234,14 @@ def run( ssh_mode: The mode to use for ssh. See SSHMode for more details. separate_stderr: Whether to separate stderr from stdout. + connect_timeout: timeout in seconds for the ssh connection. + source_bashrc: Whether to source the ~/.bashrc before running the + command. + skip_lines: The number of lines to skip at the beginning of the + output. This is used when the output is not processed by + SkyPilot but we still want to get rid of some warning messages, + such as SSH warnings. + Returns: returncode @@ -393,6 +407,7 @@ def run( separate_stderr: bool = False, connect_timeout: Optional[int] = None, source_bashrc: bool = False, + skip_lines: int = 0, **kwargs) -> Union[int, Tuple[int, str, str]]: """Uses 'ssh' to run 'cmd' on a node with ip. @@ -410,7 +425,13 @@ def run( ssh_mode: The mode to use for ssh. See SSHMode for more details. separate_stderr: Whether to separate stderr from stdout. - + connect_timeout: timeout in seconds for the ssh connection. + source_bashrc: Whether to source the bashrc before running the + command. + skip_lines: The number of lines to skip at the beginning of the + output. This is used when the output is not processed by + SkyPilot but we still want to get rid of some warning messages, + such as SSH warnings. 
Returns: returncode @@ -431,12 +452,8 @@ def run( cmd, process_stream, separate_stderr, - # A hack to remove the following SSH warning+bash warnings (twice): - # Warning: Permanently added 'xx.xx.xx.xx' to the list of known... - # bash: cannot set terminal process group - # bash: no job control in this shell - # When not source_bashrc, the bash warning will only show once. - skip_lines=5 if source_bashrc else 3, + # +1 to skip first new line. + skip_lines=skip_lines + 1, source_bashrc=source_bashrc) command = base_ssh_command + [shlex.quote(command_str)] diff --git a/sky/utils/command_runner.pyi b/sky/utils/command_runner.pyi index e8f12ef6ebe..9fbad243775 100644 --- a/sky/utils/command_runner.pyi +++ b/sky/utils/command_runner.pyi @@ -59,6 +59,9 @@ class CommandRunner: process_stream: bool = ..., stream_logs: bool = ..., separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., **kwargs) -> int: ... @@ -71,6 +74,9 @@ class CommandRunner: process_stream: bool = ..., stream_logs: bool = ..., separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., **kwargs) -> Tuple[int, str, str]: ... @@ -83,6 +89,9 @@ class CommandRunner: process_stream: bool = ..., stream_logs: bool = ..., separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., **kwargs) -> Union[Tuple[int, str, str], int]: ... @@ -136,6 +145,9 @@ class SSHCommandRunner(CommandRunner): stream_logs: bool = ..., ssh_mode: SshMode = ..., separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., **kwargs) -> int: ... 
@@ -150,6 +162,9 @@ class SSHCommandRunner(CommandRunner): stream_logs: bool = ..., ssh_mode: SshMode = ..., separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., **kwargs) -> Tuple[int, str, str]: ... @@ -164,6 +179,9 @@ class SSHCommandRunner(CommandRunner): stream_logs: bool = ..., ssh_mode: SshMode = ..., separate_stderr: bool = ..., + connect_timeout: Optional[int] = ..., + source_bashrc: bool = ..., + skip_lines: int = ..., **kwargs) -> Union[Tuple[int, str, str], int]: ... diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 1eb1c621976..db8f684c228 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -270,34 +270,76 @@ def test_example_app(): run_one_test(test) +_VALIDATE_LAUNCH_OUTPUT = ( + # Validate the output of the job submission: + # I 05-23 07:52:47 cloud_vm_ray_backend.py:3217] Running setup on 1 node. + # running setup + # I 05-23 07:52:49 cloud_vm_ray_backend.py:3230] Setup completed. + # I 05-23 07:52:55 cloud_vm_ray_backend.py:3319] Job submitted with Job ID: 1 + # I 05-23 07:52:58 log_lib.py:408] Start streaming logs for job 1. + # INFO: Tip: use Ctrl-C to exit log streaming (task will not be killed). + # INFO: Waiting for task resources on 1 node. This will block if the cluster is full. + # INFO: All task resources reserved. + # INFO: Reserved IPs: ['10.128.0.127'] + # (min, pid=4164) # conda environments: + # (min, pid=4164) # + # (min, pid=4164) base * /opt/conda + # (min, pid=4164) + # (min, pid=4164) task run finish + # INFO: Job finished (status: SUCCEEDED). 
+ 'echo "$s" && echo "==Validating setup output==" && ' + 'echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' + 'echo "==Validating running output hints==" && echo "$s" | ' + 'grep -A 1 "Job submitted with Job ID:" | ' + 'grep "Start streaming logs for job" && ' + 'echo "==Validating task output starting==" && echo "$s" | ' + 'grep -A 1 "INFO: Reserved IPs" | grep "(min, pid=" && ' + 'echo "==Validating task output ending==" && ' + 'echo "$s" | grep -A 1 "task run finish" | ' + 'grep "INFO: Job finished (status: SUCCEEDED)" && ' + 'echo "==Validating task output ending 2==" && ' + 'echo "$s" | grep -A 1 "INFO: Job finished (status: SUCCEEDED)" | ' + 'grep "Job ID:"') + + # ---------- A minimal task ---------- def test_minimal(generic_cloud: str): name = _get_cluster_name() + validate_output = _VALIDATE_LAUNCH_OUTPUT + # Kubernetes will output an SSH warning for proxy jump, which will cause + # the output validation to fail. We skip the check for kubernetes for now. + if generic_cloud.lower() == 'kubernetes': + validate_output = 'true' test = Test( 'minimal', [ - f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', + f's=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {validate_output}', + # Output validation done. f'sky logs {name} 1 --status', f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. + # Test launch output again on existing cluster + f's=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {validate_output}', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. # Check the logs downloading f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log', # Ensure the raylet process has the correct file descriptor limit. 
f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky logs {name} 3 --status', # Ensure the job succeeded. # Install jq for the next test. f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', # Check the cluster info f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', f'sky logs {name} 5 --status', # Ensure the job succeeded. + f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', + f'sky logs {name} 6 --status', # Ensure the job succeeded. # Test '-c' for exec f'sky exec -c {name} echo', - f'sky logs {name} 6 --status', - f'sky exec echo -c {name}', f'sky logs {name} 7 --status', + f'sky exec echo -c {name}', + f'sky logs {name} 8 --status', f'sky exec -c {name} echo hi test', - f'sky logs {name} 8 | grep "hi test"', + f'sky logs {name} 9 | grep "hi test"', f'sky exec {name} && exit 1 || true', f'sky exec -c {name} && exit 1 || true', ], diff --git a/tests/test_yamls/minimal.yaml b/tests/test_yamls/minimal.yaml index d7f9d4482f2..832b55e507f 100644 --- a/tests/test_yamls/minimal.yaml +++ b/tests/test_yamls/minimal.yaml @@ -5,3 +5,4 @@ setup: | run: | conda env list + echo "task run finish"