Skip to content

Commit

Permalink
[UX] Fix the output skipped lines (#3586)
Browse files Browse the repository at this point in the history
* Quote the command correctly when source_bashrc is not set

* Remove unnecessary source bashrc

* format

* Fix setup script for conda

* Add comment

* format

* Separate env for skypilot

* add test smoke

* add system site-packages

* add test for default to non-base conda env

* Fix controllers and ray node providers

* move activate to maybe_skylet

* Make axolotl example work for kubernetes

* fix axolotl

* add test for 3.12

* format

* Fix docker PATH

* format

* add axolotl image in test

* address comments

* revert grpcio version as it is only installed in our runtime env

* refactor command for env set up

* switch to curl as CentOS may not have wget installed but have curl

* add l4 in command

* fix dependency for test

* fix python path for ray executable

* Fix azure launch

* add comments

* fix test

* fix smoke

* fix name

* fix

* fix usage

* fix usage for accelerators

* fix event

* fix name

* fix

* Fix connection output

* Fix

* check twice

* add Job ID

* fix

* avoid validate output for kubernetes due to ssh jump

* fix validate

* fix axolotl

* add comment

* Update sky/backends/cloud_vm_ray_backend.py

Co-authored-by: Tian Xia <[email protected]>

* Update sky/utils/command_runner.py

Co-authored-by: Tian Xia <[email protected]>

* fix comment

---------

Co-authored-by: Tian Xia <[email protected]>
  • Loading branch information
Michaelvll and cblmemo authored May 23, 2024
1 parent 97cba00 commit 60d0c26
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 21 deletions.
1 change: 1 addition & 0 deletions llm/axolotl/axolotl-spot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ envs:
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass.

4
6 changes: 0 additions & 6 deletions llm/axolotl/axolotl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,3 @@ run: |
envs:
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.






5 changes: 5 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3177,6 +3177,11 @@ def _setup_node(node_id: int) -> None:
process_stream=False,
# We do not source bashrc for setup, since bashrc is sourced
# in the script already.
# Skip two lines due to the /bin/bash -i and source ~/.bashrc
# in the setup_cmd.
# bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
# bash: no job control in this shell
skip_lines=2,
)

def error_message() -> str:
Expand Down
33 changes: 25 additions & 8 deletions sky/utils/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,18 @@ def ssh_options_list(
arg_dict = {
# SSH port
'Port': port,
# Supresses initial fingerprint verification.
# Suppresses initial fingerprint verification.
'StrictHostKeyChecking': 'no',
# SSH IP and fingerprint pairs no longer added to known_hosts.
# This is to remove a 'REMOTE HOST IDENTIFICATION HAS CHANGED'
# warning if a new node has the same IP as a previously
# deleted node, because the fingerprints will not match in
# that case.
'UserKnownHostsFile': os.devnull,
# Suppresses the warning messages, such as:
# Warning: Permanently added '34.69.216.203' (ED25519) to the list of
# known hosts.
'LogLevel': 'ERROR',
# Try fewer extraneous key pairs.
'IdentitiesOnly': 'yes',
# Abort if port forwarding fails (instead of just printing to
Expand Down Expand Up @@ -216,7 +220,9 @@ def run(
stream_logs: bool = True,
ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
separate_stderr: bool = False,
connect_timeout: Optional[int] = None,
source_bashrc: bool = False,
skip_lines: int = 0,
**kwargs) -> Union[int, Tuple[int, str, str]]:
"""Runs the command on the cluster.
Expand All @@ -228,6 +234,14 @@ def run(
ssh_mode: The mode to use for ssh.
See SSHMode for more details.
separate_stderr: Whether to separate stderr from stdout.
connect_timeout: timeout in seconds for the ssh connection.
source_bashrc: Whether to source the ~/.bashrc before running the
command.
skip_lines: The number of lines to skip at the beginning of the
output. This is used when the output is not processed by
SkyPilot but we still want to get rid of some warning messages,
such as SSH warnings.
Returns:
returncode
Expand Down Expand Up @@ -393,6 +407,7 @@ def run(
separate_stderr: bool = False,
connect_timeout: Optional[int] = None,
source_bashrc: bool = False,
skip_lines: int = 0,
**kwargs) -> Union[int, Tuple[int, str, str]]:
"""Uses 'ssh' to run 'cmd' on a node with ip.
Expand All @@ -410,7 +425,13 @@ def run(
ssh_mode: The mode to use for ssh.
See SSHMode for more details.
separate_stderr: Whether to separate stderr from stdout.
connect_timeout: timeout in seconds for the ssh connection.
source_bashrc: Whether to source the bashrc before running the
command.
skip_lines: The number of lines to skip at the beginning of the
output. This is used when the output is not processed by
SkyPilot but we still want to get rid of some warning messages,
such as SSH warnings.
Returns:
returncode
Expand All @@ -431,12 +452,8 @@ def run(
cmd,
process_stream,
separate_stderr,
# A hack to remove the following SSH warning+bash warnings (twice):
# Warning: Permanently added 'xx.xx.xx.xx' to the list of known...
# bash: cannot set terminal process group
# bash: no job control in this shell
# When not source_bashrc, the bash warning will only show once.
skip_lines=5 if source_bashrc else 3,
# +1 to skip first new line.
skip_lines=skip_lines + 1,
source_bashrc=source_bashrc)
command = base_ssh_command + [shlex.quote(command_str)]

Expand Down
18 changes: 18 additions & 0 deletions sky/utils/command_runner.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class CommandRunner:
process_stream: bool = ...,
stream_logs: bool = ...,
separate_stderr: bool = ...,
connect_timeout: Optional[int] = ...,
source_bashrc: bool = ...,
skip_lines: int = ...,
**kwargs) -> int:
...

Expand All @@ -71,6 +74,9 @@ class CommandRunner:
process_stream: bool = ...,
stream_logs: bool = ...,
separate_stderr: bool = ...,
connect_timeout: Optional[int] = ...,
source_bashrc: bool = ...,
skip_lines: int = ...,
**kwargs) -> Tuple[int, str, str]:
...

Expand All @@ -83,6 +89,9 @@ class CommandRunner:
process_stream: bool = ...,
stream_logs: bool = ...,
separate_stderr: bool = ...,
connect_timeout: Optional[int] = ...,
source_bashrc: bool = ...,
skip_lines: int = ...,
**kwargs) -> Union[Tuple[int, str, str], int]:
...

Expand Down Expand Up @@ -136,6 +145,9 @@ class SSHCommandRunner(CommandRunner):
stream_logs: bool = ...,
ssh_mode: SshMode = ...,
separate_stderr: bool = ...,
connect_timeout: Optional[int] = ...,
source_bashrc: bool = ...,
skip_lines: int = ...,
**kwargs) -> int:
...

Expand All @@ -150,6 +162,9 @@ class SSHCommandRunner(CommandRunner):
stream_logs: bool = ...,
ssh_mode: SshMode = ...,
separate_stderr: bool = ...,
connect_timeout: Optional[int] = ...,
source_bashrc: bool = ...,
skip_lines: int = ...,
**kwargs) -> Tuple[int, str, str]:
...

Expand All @@ -164,6 +179,9 @@ class SSHCommandRunner(CommandRunner):
stream_logs: bool = ...,
ssh_mode: SshMode = ...,
separate_stderr: bool = ...,
connect_timeout: Optional[int] = ...,
source_bashrc: bool = ...,
skip_lines: int = ...,
**kwargs) -> Union[Tuple[int, str, str], int]:
...

Expand Down
56 changes: 49 additions & 7 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,34 +270,76 @@ def test_example_app():
run_one_test(test)


_VALIDATE_LAUNCH_OUTPUT = (
# Validate the output of the job submission:
# I 05-23 07:52:47 cloud_vm_ray_backend.py:3217] Running setup on 1 node.
# running setup
# I 05-23 07:52:49 cloud_vm_ray_backend.py:3230] Setup completed.
# I 05-23 07:52:55 cloud_vm_ray_backend.py:3319] Job submitted with Job ID: 1
# I 05-23 07:52:58 log_lib.py:408] Start streaming logs for job 1.
# INFO: Tip: use Ctrl-C to exit log streaming (task will not be killed).
# INFO: Waiting for task resources on 1 node. This will block if the cluster is full.
# INFO: All task resources reserved.
# INFO: Reserved IPs: ['10.128.0.127']
# (min, pid=4164) # conda environments:
# (min, pid=4164) #
# (min, pid=4164) base * /opt/conda
# (min, pid=4164)
# (min, pid=4164) task run finish
# INFO: Job finished (status: SUCCEEDED).
'echo "$s" && echo "==Validating setup output==" && '
'echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && '
'echo "==Validating running output hints==" && echo "$s" | '
'grep -A 1 "Job submitted with Job ID:" | '
'grep "Start streaming logs for job" && '
'echo "==Validating task output starting==" && echo "$s" | '
'grep -A 1 "INFO: Reserved IPs" | grep "(min, pid=" && '
'echo "==Validating task output ending==" && '
'echo "$s" | grep -A 1 "task run finish" | '
'grep "INFO: Job finished (status: SUCCEEDED)" && '
'echo "==Validating task output ending 2==" && '
'echo "$s" | grep -A 1 "INFO: Job finished (status: SUCCEEDED)" | '
'grep "Job ID:"')


# ---------- A minimal task ----------
def test_minimal(generic_cloud: str):
name = _get_cluster_name()
validate_output = _VALIDATE_LAUNCH_OUTPUT
# Kubernetes will output a SSH Warning for proxy jump, which will cause
# the output validation fail. We skip the check for kubernetes for now.
if generic_cloud.lower() == 'kubernetes':
validate_output = 'true'
test = Test(
'minimal',
[
f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml',
f's=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {validate_output}',
# Output validation done.
f'sky logs {name} 1 --status',
f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent.
# Test launch output again on existing cluster
f's=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {validate_output}',
f'sky logs {name} 2 --status',
f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent.
# Check the logs downloading
f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log',
# Ensure the raylet process has the correct file descriptor limit.
f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"',
f'sky logs {name} 2 --status', # Ensure the job succeeded.
f'sky logs {name} 3 --status', # Ensure the job succeeded.
# Install jq for the next test.
f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'',
# Check the cluster info
f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'',
f'sky logs {name} 4 --status', # Ensure the job succeeded.
f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'',
f'sky logs {name} 5 --status', # Ensure the job succeeded.
f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'',
f'sky logs {name} 6 --status', # Ensure the job succeeded.
# Test '-c' for exec
f'sky exec -c {name} echo',
f'sky logs {name} 6 --status',
f'sky exec echo -c {name}',
f'sky logs {name} 7 --status',
f'sky exec echo -c {name}',
f'sky logs {name} 8 --status',
f'sky exec -c {name} echo hi test',
f'sky logs {name} 8 | grep "hi test"',
f'sky logs {name} 9 | grep "hi test"',
f'sky exec {name} && exit 1 || true',
f'sky exec -c {name} && exit 1 || true',
],
Expand Down
1 change: 1 addition & 0 deletions tests/test_yamls/minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ setup: |
run: |
conda env list
echo "task run finish"

0 comments on commit 60d0c26

Please sign in to comment.