From d8fecfe69fac6e2889840c77b16e06f28a0d5fb8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 26 Mar 2024 07:44:08 +0000 Subject: [PATCH 01/11] save ray path --- sky/skylet/constants.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 9dd8ac25f24..4c22c42c38d 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -124,9 +124,16 @@ f'{SKY_PIP_CMD} list | grep "ray " | ' f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' f'|| {RAY_STATUS} || ' - f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION};' # pylint: disable=line-too-long + f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION} | tee /tmp/skypilot_ray_installation.log;' # pylint: disable=line-too-long + # Add missing PATH to make sure ray is in the PATH, when the + # previous ray installation happens in user's `~/.local` directory. + 'bin_path=$(grep "WARNING: The scripts ray,.* are installed in" ' + '/tmp/skypilot_ray_installation.log | ' + r'sed "s/.*installed in \'\(.*\)\' which is not on PATH.*/\1/");' # Writes ray path to file if it does not exist or the file is empty. - f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE};' + f'[ -s {SKY_RAY_PATH_FILE} ] || {{ [ -n $bin_path ] && ' + f'printf $bin_path > {SKY_RAY_PATH_FILE} || ' + f'which ray > {SKY_RAY_PATH_FILE}; }};' # END ray package check and installation f'{{ {SKY_PIP_CMD} list | grep "skypilot " && ' '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long From 98fffc4548e803694abc2f674e46c2f831be1bc0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 26 Mar 2024 07:49:54 +0000 Subject: [PATCH 02/11] only use path in file when non empty --- sky/skylet/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 4c22c42c38d..02e7da31e90 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -30,13 +30,13 @@ # used for installing SkyPilot runtime (ray and skypilot). SKY_PYTHON_PATH_FILE = '~/.sky/python_path' SKY_RAY_PATH_FILE = '~/.sky/ray_path' -SKY_GET_PYTHON_PATH_CMD = (f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || ' +SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || ' 'which python3') # Python executable, e.g., /opt/conda/bin/python3 SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip' # Ray executable, e.g., /opt/conda/bin/ray -SKY_RAY_CMD = f'$(cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)' +SKY_RAY_CMD = f'$([ -s {SKY_RAY_PATH_FILE} ] && cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)' # The name for the environment variable that stores the unique ID of the # current task. This will stay the same across multiple recoveries of the From 74b9d1e45efcfea67d9037b203aa94778182f5ac Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 26 Mar 2024 08:24:21 +0000 Subject: [PATCH 03/11] try grepping but fail --- sky/skylet/constants.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 02e7da31e90..aabd494831f 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -121,19 +121,20 @@ # latest ray port 6380, but those existing cluster launched before #1790 # that has ray cluster on the default port 6379 will be upgraded and # restarted. + 'echo PATH=$PATH; ' f'{SKY_PIP_CMD} list | grep "ray " | ' f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' f'|| {RAY_STATUS} || ' - f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION} | tee /tmp/skypilot_ray_installation.log;' # pylint: disable=line-too-long + f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION} | tee /tmp/skypilot_ray_installation.log; ' # pylint: disable=line-too-long # Add missing PATH to make sure ray is in the PATH, when the # previous ray installation happens in user's `~/.local` directory. 'bin_path=$(grep "WARNING: The scripts ray,.* are installed in" ' '/tmp/skypilot_ray_installation.log | ' - r'sed "s/.*installed in \'\(.*\)\' which is not on PATH.*/\1/");' + 'sed "s/.*installed in \'\\(.*\\)\' which is not on PATH.*/\\1/");' # Writes ray path to file if it does not exist or the file is empty. - f'[ -s {SKY_RAY_PATH_FILE} ] || {{ [ -n $bin_path ] && ' - f'printf $bin_path > {SKY_RAY_PATH_FILE} || ' - f'which ray > {SKY_RAY_PATH_FILE}; }};' + f'[ -s {SKY_RAY_PATH_FILE} ] || {{ [ -n "$bin_path" ] && ' + f'echo $bin_path > {SKY_RAY_PATH_FILE} || ' + f'which ray > {SKY_RAY_PATH_FILE}; }}; ' # END ray package check and installation f'{{ {SKY_PIP_CMD} list | grep "skypilot " && ' '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long From 9108510a934b2abad69e7a63b2ae556bb0cd3830 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 26 Mar 2024 08:27:35 +0000 Subject: [PATCH 04/11] hardcode ~/.local/bin --- sky/skylet/constants.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index aabd494831f..f7a391f19ef 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -125,16 +125,12 @@ f'{SKY_PIP_CMD} list | grep "ray " | ' f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' f'|| {RAY_STATUS} || ' - f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION} | tee /tmp/skypilot_ray_installation.log; ' # pylint: disable=line-too-long + f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long # Add missing PATH to make sure ray is in the PATH, when the # previous ray installation happens in user's `~/.local` directory. - 'bin_path=$(grep "WARNING: The scripts ray,.* are installed in" ' - '/tmp/skypilot_ray_installation.log | ' - 'sed "s/.*installed in \'\\(.*\\)\' which is not on PATH.*/\\1/");' + 'export PATH=$PATH:$HOME/.local/bin; ' # Writes ray path to file if it does not exist or the file is empty. - f'[ -s {SKY_RAY_PATH_FILE} ] || {{ [ -n "$bin_path" ] && ' - f'echo $bin_path > {SKY_RAY_PATH_FILE} || ' - f'which ray > {SKY_RAY_PATH_FILE}; }}; ' + f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; ' # END ray package check and installation f'{{ {SKY_PIP_CMD} list | grep "skypilot " && ' '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long From 14d9eae80d2d7338816318d19d1c5ddf608079ae Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 26 Mar 2024 08:34:52 +0000 Subject: [PATCH 05/11] format --- sky/skylet/constants.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index f7a391f19ef..4868e9649fd 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -30,13 +30,15 @@ # used for installing SkyPilot runtime (ray and skypilot). SKY_PYTHON_PATH_FILE = '~/.sky/python_path' SKY_RAY_PATH_FILE = '~/.sky/ray_path' -SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || ' +SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && ' + f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || ' 'which python3') # Python executable, e.g., /opt/conda/bin/python3 SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip' # Ray executable, e.g., /opt/conda/bin/ray -SKY_RAY_CMD = f'$([ -s {SKY_RAY_PATH_FILE} ] && cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)' +SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && ' + f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)') # The name for the environment variable that stores the unique ID of the # current task. This will stay the same across multiple recoveries of the From efcd14c01e37ee0d7bf206e9b625b93b88bf857e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 29 Mar 2024 19:09:37 +0000 Subject: [PATCH 06/11] add comments --- sky/skylet/constants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 4868e9649fd..b413765c8f0 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -130,6 +130,8 @@ f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long # Add missing PATH to make sure ray is in the PATH, when the # previous ray installation happens in user's `~/.local` directory. + # ~/.local/bin is added to the end of PATH to avoid conflicts with ray just + # installed in the conda environment. 'export PATH=$PATH:$HOME/.local/bin; ' # Writes ray path to file if it does not exist or the file is empty. f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; ' From a2a2d1ecc1442cdb986b5e589eff9d59790122ae Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 Apr 2024 21:22:02 +0000 Subject: [PATCH 07/11] Add comments --- sky/skylet/constants.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index b413765c8f0..dc7f16ee616 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -110,6 +110,8 @@ # backend_utils.write_cluster_config. RAY_SKYPILOT_INSTALLATION_COMMANDS = ( 'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;' + # Print the PATH in provision.log to help debug PATH issues. + 'echo PATH=$PATH; ' # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the # ray cluster is already running, to avoid the ray cluster being restarted. # @@ -123,15 +125,18 @@ # latest ray port 6380, but those existing cluster launched before #1790 # that has ray cluster on the default port 6379 will be upgraded and # restarted. - 'echo PATH=$PATH; ' f'{SKY_PIP_CMD} list | grep "ray " | ' f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' f'|| {RAY_STATUS} || ' f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long - # Add missing PATH to make sure ray is in the PATH, when the - # previous ray installation happens in user's `~/.local` directory. - # ~/.local/bin is added to the end of PATH to avoid conflicts with ray just - # installed in the conda environment. + # In some envs, e.g. pip does not have permission to write under /opt/conda + # ray package will be installed under ~/.local/bin. If the user's PATH does + # not include ~/.local/bin (the pip install will have the output: `WARNING: + # The scripts ray, rllib, serve and tune are installed in '~/.local/bin' + # which is not on PATH.`), causing an empty SKY_RAY_PATH_FILE later. + # + # Here, we add ~/.local/bin to the end of the PATH to make sure the issues + # mentioned above are resolved. 'export PATH=$PATH:$HOME/.local/bin; ' # Writes ray path to file if it does not exist or the file is empty. f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; ' From 48e57e214e734edeeac60c7e2775917bce67b200 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 Apr 2024 21:24:07 +0000 Subject: [PATCH 08/11] format --- sky/skylet/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index dc7f16ee616..4db25755a9d 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -131,7 +131,7 @@ f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long # In some envs, e.g. pip does not have permission to write under /opt/conda # ray package will be installed under ~/.local/bin. If the user's PATH does - # not include ~/.local/bin (the pip install will have the output: `WARNING: + # not include ~/.local/bin (the pip install will have the output: `WARNING: # The scripts ray, rllib, serve and tune are installed in '~/.local/bin' # which is not on PATH.`), causing an empty SKY_RAY_PATH_FILE later. # From 4b40d30365e12b36e3c84b7b019d56755702c6b5 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 Apr 2024 23:00:07 +0000 Subject: [PATCH 09/11] avoid backward compat test conflict --- tests/backward_compatibility_tests.sh | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 424a0f5f9f7..f030dd3460d 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -13,7 +13,7 @@ # bash tests/backward_compatibility_tests.sh #!/bin/bash -set -ev +set -evx need_launch=${1:-0} start_from=${2:-0} @@ -160,30 +160,33 @@ fi # Test spot jobs to make sure existing jobs and new job can run correctly, after # the spot controller is updated. +# Get a new uuid to avoid conflict with previous back-compat tests. +uuid=$(uuidgen) +SPOT_JOB_JOB_NAME=${CLUSTER_NAME}-${uuid:0:4}-7 if [ "$start_from" -le 7 ]; then conda activate sky-back-compat-master rm -r ~/.sky/wheels || true -sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-0 "echo hi; sleep 1000" -sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-1 "echo hi; sleep 300" +sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${SPOT_JOB_JOB_NAME}-7-0 "echo hi; sleep 1000" +sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${SPOT_JOB_JOB_NAME}-7-1 "echo hi; sleep 300" conda activate sky-back-compat-current rm -r ~/.sky/wheels || true -s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-1) +s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-1) echo "$s" echo "$s" | grep " hi" || exit 1 -sky spot launch -d --cloud ${CLOUD} -y -n ${CLUSTER_NAME}-7-2 "echo hi; sleep 60" -s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-2) +sky spot launch -d --cloud ${CLOUD} -y -n ${SPOT_JOB_JOB_NAME}-7-2 "echo hi; sleep 60" +s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-2) echo "$s" echo "$s" | grep " hi" || exit 1 -s=$(sky spot queue | grep ${CLUSTER_NAME}-7) +s=$(sky spot queue | grep ${SPOT_JOB_JOB_NAME}-7) echo "$s" echo "$s" | grep "RUNNING" | wc -l | grep 3 || exit 1 -sky spot cancel -y -n ${CLUSTER_NAME}-7-0 -sky spot logs -n "${CLUSTER_NAME}-7-1" -s=$(sky spot queue | grep ${CLUSTER_NAME}-7) +sky spot cancel -y -n ${SPOT_JOB_JOB_NAME}-7-0 +sky spot logs -n "${SPOT_JOB_JOB_NAME}-7-1" +s=$(sky spot queue | grep ${SPOT_JOB_JOB_NAME}-7) echo "$s" echo "$s" | grep "SUCCEEDED" | wc -l | grep 2 || exit 1 echo "$s" | grep "CANCELLED" | wc -l | grep 1 || exit 1 fi sky down ${CLUSTER_NAME}* -y -sky spot cancel -n ${CLUSTER_NAME}* -y +sky spot cancel -n ${SPOT_JOB_JOB_NAME}* -y From e03e7a990fe640bae46489390a9394c8c7f09e93 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 Apr 2024 23:28:01 +0000 Subject: [PATCH 10/11] fix sleep --- tests/backward_compatibility_tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index f030dd3460d..bd59718cfad 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -162,7 +162,7 @@ fi # the spot controller is updated. # Get a new uuid to avoid conflict with previous back-compat tests. uuid=$(uuidgen) -SPOT_JOB_JOB_NAME=${CLUSTER_NAME}-${uuid:0:4}-7 +SPOT_JOB_JOB_NAME=${CLUSTER_NAME}-${uuid:0:4} if [ "$start_from" -le 7 ]; then conda activate sky-back-compat-master rm -r ~/.sky/wheels || true @@ -173,7 +173,7 @@ rm -r ~/.sky/wheels || true s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-1) echo "$s" echo "$s" | grep " hi" || exit 1 -sky spot launch -d --cloud ${CLOUD} -y -n ${SPOT_JOB_JOB_NAME}-7-2 "echo hi; sleep 60" +sky spot launch -d --cloud ${CLOUD} -y -n ${SPOT_JOB_JOB_NAME}-7-2 "echo hi; sleep 40" s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-2) echo "$s" echo "$s" | grep " hi" || exit 1 From 19985e92ea060d9306e1deb8a036c9819c2cbc84 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 15 Apr 2024 06:13:33 +0000 Subject: [PATCH 11/11] Fix the task ID for spot pipeline --- sky/spot/controller.py | 12 +++++++++--- tests/test_smoke.py | 6 +++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sky/spot/controller.py b/sky/spot/controller.py index 8a5a4cceb07..9308d1dd532 100644 --- a/sky/spot/controller.py +++ b/sky/spot/controller.py @@ -60,9 +60,15 @@ def __init__(self, job_id: int, dag_yaml: str, # the user can have the same id for multiple recoveries. # Example value: sky-2022-10-04-22-46-52-467694_my-spot-name_spot_id-17-0 job_id_env_vars = [] - for i in range(len(self._dag.tasks)): - task_name = self._dag_name - if len(self._dag.tasks) > 1: + for i, task in enumerate(self._dag.tasks): + if len(self._dag.tasks) <= 1: + task_name = self._dag_name + else: + task_name = task.name + # This is guaranteed by the spot_launch API, where we fill in + # the task.name with + # dag_utils.maybe_infer_and_fill_dag_and_task_names. + assert task_name is not None, self._dag task_name = f'{self._dag_name}_{task_name}' job_id_env_var = common_utils.get_global_job_id( self._backend.run_timestamp, diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 944818f05dc..4a549710be5 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2169,7 +2169,7 @@ def test_spot(generic_cloud: str): @pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances @pytest.mark.managed_spot def test_spot_pipeline(generic_cloud: str): - """Test a spot pipeline.""" + """Test a spot pipeline.""" name = _get_cluster_name() test = Test( 'spot-pipeline', @@ -2358,7 +2358,7 @@ def test_spot_pipeline_recovery_aws(aws_config_region): # separated by `-`. ( f'SPOT_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - 'cut -d\'-\' -f2 | rev`;' + 'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;' f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' # TODO(zhwu): fix the name for spot cluster. @@ -2407,7 +2407,7 @@ def test_spot_pipeline_recovery_gcp(): # SKYPILOT_TASK_ID, which gets the second to last field # separated by `-`. (f'SPOT_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - f'cut -d\'-\' -f2 | rev`;{terminate_cmd}'), + f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), 'sleep 60', f'{_SPOT_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"', 'sleep 200',