From 468409c51d3daf8a0df6632504a8cff0f324429e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 7 Nov 2024 17:03:08 +0800 Subject: [PATCH 01/22] event based smoke test --- tests/test_smoke.py | 99 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index cdfd9dfc7cb..b51e720e84a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,6 +25,7 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws +import enum import inspect import json import os @@ -60,6 +61,8 @@ from sky.data.data_utils import Rclone from sky.skylet import constants from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus from sky.utils import common_utils from sky.utils import resources_utils from sky.utils import subprocess_utils @@ -95,6 +98,64 @@ 'sleep 10; s=$(sky jobs queue);' 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') +_WAIT_UNTIL_CLUSTER_STATUS_IS = ( + # A while loop to wait until the cluster status + # becomes certain status, with timeout. + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky status {cluster_name} --refresh | ' + 'awk "/^{cluster_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' + 'if [ "$current_status" == "{cluster_status}" ]; ' + 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' + 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' + 'sleep 30; ' + 'done') + +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( + # A while loop to wait until the cluster is not found or timeout + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' + 'fi; ' + 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' + ' echo "Cluster {cluster_name} successfully removed."; break; ' + 'fi; ' + 'echo "Waiting for cluster {name} to be removed..."; ' + 'sleep 15; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS = ( + # A while loop to wait until the job status + # contains certain status, with timeout. 
+ 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky queue {cluster_name} | ' + 'awk "/{job_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' + 'found=0; ' # Initialize found variable outside the loop + 'while read -r line; do ' # Read line by line + ' if [ "$line" == "{job_status}" ]; then ' # Check each line + ' echo "Target job status \'{job_status}\' reached."; ' + ' found=1; ' + ' break; ' # Break inner loop + ' fi; ' + 'done <<< "$current_status"; ' + 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found + 'echo "Waiting for job status to contains \'{job_status}\', current status: $current_status"; ' + 'sleep 15; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS.replace( + 'awk "/{job_name}/', 'awk "') + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -399,7 +460,6 @@ def test_launch_fast_with_autostop(generic_cloud: str): # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = Test( 'test_launch_fast_with_autostop', [ @@ -407,10 +467,12 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', - f'sleep {autostop_timeout}', # Ensure cluster is stopped - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Launch again. Do full output validation - we expect the cluster to re-launch f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', @@ -808,7 +870,10 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - 'sleep 60', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=60), f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', f'sky logs {name}-clone 1 --status', @@ -854,8 +919,8 @@ def test_gcp_mig(): # Check MIG exists. f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - 'sleep 120', - f'sky status -r {name}; sky status {name} | grep "{name} not found"', + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. 
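For context, each string in a Test's command list runs as a shell step, so a wait helper only needs to exit non-zero on timeout to fail the test. A minimal sketch of how a rendered wait step is meant to slot into a test, using the helpers added above; the cluster name, cloud, and timeout here are illustrative only:

    name = 'autostop-demo'  # illustrative cluster name
    autostop_timeout = 250  # illustrative timeout in seconds
    test = Test(
        'wait_until_stopped_sketch',
        [
            f'sky launch -y -c {name} --cloud aws -i 1 tests/test_yamls/minimal.yaml',
            # Poll `sky status --refresh` instead of sleeping for a fixed time.
            _WAIT_UNTIL_CLUSTER_STATUS_IS.format(
                cluster_name=name,
                cluster_status=ClusterStatus.STOPPED.value,
                timeout=autostop_timeout),
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)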
@@ -922,8 +987,10 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - 'sleep 60', - f'sky status -r {name} | grep "STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', @@ -944,7 +1011,10 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - 'sleep 100', # Ensure this is large enough, else GCP leaks. + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', @@ -972,13 +1042,18 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - 'sleep 40', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', ) From 7191844ae2a7466897c75fc42ed9c116936a0db5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 17:14:31 +0800 Subject: [PATCH 02/22] more event based smoke test --- tests/test_smoke.py | 47 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b51e720e84a..a11ff9d8ed8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -108,11 +108,11 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' - 'if [ "$current_status" == "{cluster_status}" ]; ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^{cluster_status}$/) print \$i}}"); ' + 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' - 'sleep 30; ' + 'sleep 15; ' 'done') _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( @@ -129,7 +129,7 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS = ( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. 
'start_time=$SECONDS; ' @@ -138,7 +138,7 @@ ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' 'fi; ' 'current_status=$(sky queue {cluster_name} | ' - 'awk "/{job_name}/ ' + 'awk "\\$1 == \\"{job_id}\\" ' '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line @@ -153,8 +153,11 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS.replace( - 'awk "/{job_name}/', 'awk "') +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_name}\\"', 'awk "') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\"') DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -1083,8 +1086,10 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', ) @@ -1888,10 +1893,15 @@ def test_multi_echo(generic_cloud: str): 'multi_echo', [ f'python examples/multi_echo.py {name} {generic_cloud}', - 'sleep 120', ] + # Ensure jobs succeeded. - [f'sky logs {name} {i + 1} --status' for i in range(32)] + + [ + _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=name, + job_id=i + 1, + job_status=JobStatus.SUCCEEDED.value, + timeout=120) for i in range(32) + ] + # Ensure monitor/autoscaler didn't crash on the 'assert not # unfulfilled' error. If process not found, grep->ssh returns 1. [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], @@ -1984,7 +1994,8 @@ def test_tpu(): f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. ], - f'sky down -y {name}', + 'echo "hello"', + #f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) run_one_test(test) @@ -2444,12 +2455,18 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - f'sleep 20', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. 
- 'sleep 180', - f'sky status -r {name} | grep "INIT\|STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=200), ], f'sky down -y {name}', ) From 5cbebebae882ff172917727b4aa00ab767bd986e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 17:56:37 +0800 Subject: [PATCH 03/22] more test cases --- tests/test_smoke.py | 53 ++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index a11ff9d8ed8..9c422cda194 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -108,10 +108,10 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^{cluster_status}$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' 'if [[ "$current_status" =~ {cluster_status} ]]; ' - 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' - 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' + 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' + 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' 'sleep 15; ' 'done') @@ -143,21 +143,21 @@ 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line ' if [ "$line" == "{job_status}" ]; then ' # Check each line - ' echo "Target job status \'{job_status}\' reached."; ' + ' echo "Target job status {job_status} reached."; ' ' found=1; ' ' break; ' # Break inner loop ' fi; ' 'done <<< "$current_status"; ' 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found - 'echo "Waiting for job status to contains \'{job_status}\', current status: $current_status"; ' + 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' 'sleep 15; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_name}\\"', 'awk "') + 'awk "\\$1 == \\"{job_id}\\"', 'awk "') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\"') + 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -2489,9 +2489,12 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - 'sleep 260', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=280) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], f'sky down -y {name}', timeout=30 * 60, # 30 mins @@ -2527,8 +2530,10 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. 
- f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). f'sky start -y {name}', @@ -2544,8 +2549,10 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Test restarting the idleness timer via exec: f'sky start -y {name}', @@ -2555,8 +2562,10 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, @@ -2773,15 +2782,19 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - 'sleep 90', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - 'sleep 120', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=120), ], f'sky down -y {name}', ) From 6f6840901b8407be2e20d9093565813029b2f83e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 18:43:20 +0800 Subject: [PATCH 04/22] more test cases with managed jobs --- tests/test_smoke.py | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9c422cda194..339d7062b0a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -98,6 +98,8 @@ 'sleep 10; s=$(sky jobs queue);' 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') +# Cluster functions + _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 
@@ -142,7 +144,7 @@ '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line - ' if [ "$line" == "{job_status}" ]; then ' # Check each line + ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line ' echo "Target job status {job_status} reached."; ' ' found=1; ' ' break; ' # Break inner loop @@ -153,12 +155,18 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') +# Managed job functions + +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( + 'sky queue {cluster_name}', + 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -1053,7 +1061,7 @@ def test_aws_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( cluster_name=name, job_status=JobStatus.FAILED.value, timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), @@ -1086,7 +1094,7 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( cluster_name=name, job_status=JobStatus.FAILED.value, timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) @@ -2814,14 +2822,21 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), f'sky jobs cancel -y -n {name}-1', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep CANCELLED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=f'{JobStatus.CANCELLED.value}', + timeout=230), # Test the functionality for logging. 
f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', @@ -2891,9 +2906,11 @@ def test_managed_jobs_failed_setup(generic_cloud: str): 'managed_jobs_failed_setup', [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - 'sleep 330', # Make sure the job failed quickly. - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{JobStatus.FAILED_SETUP.value}', + timeout=330), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From 1f67691aec7a6d66cf7733190e7ce5a142c361cb Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 18:58:41 +0800 Subject: [PATCH 05/22] bug fix --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 339d7062b0a..043cb63ea96 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2834,7 +2834,7 @@ def test_managed_jobs(generic_cloud: str): timeout=60), f'sky jobs cancel -y -n {name}-1', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', + job_name=f'{name}-1', job_status=f'{JobStatus.CANCELLED.value}', timeout=230), # Test the functionality for logging. From be7964ece6275ca782c17d50c5f8db5187cf9bfd Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 13 Nov 2024 16:41:13 +0800 Subject: [PATCH 06/22] bump up seconds --- tests/test_smoke.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e6daae0e588..7d415708cfc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -114,7 +114,7 @@ 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( @@ -128,7 +128,7 @@ ' echo "Cluster {cluster_name} successfully removed."; break; ' 'fi; ' 'echo "Waiting for cluster {name} to be removed..."; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( @@ -152,7 +152,7 @@ 'done <<< "$current_status"; ' 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( @@ -167,6 +167,11 @@ 'sky queue {cluster_name}', 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') +# After the timeout, the cluster will stop if autostop is set, and our check +# should be more than the timeout. To address this, we extend the timeout by +# _BUMP_UP_SECONDS before exiting. +_BUMP_UP_SECONDS = 35 + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -2043,8 +2048,7 @@ def test_tpu(): f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. 
], - 'echo "hello"', - #f'sky down -y {name}', + f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) run_one_test(test) @@ -2614,7 +2618,7 @@ def test_autostop(generic_cloud: str): _WAIT_UNTIL_CLUSTER_STATUS_IS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), + timeout=autostop_timeout + _BUMP_UP_SECONDS), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, @@ -2951,7 +2955,7 @@ def test_managed_jobs_failed_setup(generic_cloud: str): _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, job_status=f'{JobStatus.FAILED_SETUP.value}', - timeout=330), + timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From c464005216903f92e91cb7ca946318c31d50b33a Mon Sep 17 00:00:00 2001 From: zpoint Date: Sat, 16 Nov 2024 00:09:48 +0800 Subject: [PATCH 07/22] merge master and resolve conflict --- tests/test_smoke.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e254f6a0870..5aeb1f055fe 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -99,6 +99,8 @@ 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') # Cluster functions +_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) +_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status @@ -110,7 +112,8 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + + ')$/) print \$i}}"); ' 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' @@ -141,7 +144,8 @@ 'fi; ' 'current_status=$(sky queue {cluster_name} | ' 'awk "\\$1 == \\"{job_id}\\" ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + + ')$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line From c054edf56499a39ed42e1e62fba66b5f81411551 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 13:42:24 +0800 Subject: [PATCH 08/22] more test case --- tests/test_smoke.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5aeb1f055fe..434f0099b12 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -120,6 +120,11 @@ 'sleep 10; ' 'done') +_WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_IS.replace( + 'sky status {cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' @@ -530,6 +535,7 @@ def test_aws_region(): @pytest.mark.aws def test_aws_with_ssh_proxy_command(): name = _get_cluster_name() + with tempfile.NamedTemporaryFile(mode='w') as f: f.write( textwrap.dedent(f"""\ @@ -551,10 +557,18 @@ 
def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. - 'timeout 300s bash -c "until sky status sky-jobs-controller* | grep UP; do sleep 1; done"', + _WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD.format( + cluster_name=f'sky-jobs-controller-*', + cluster_name_awk='sky-jobs-controller-.*', + cluster_status=ClusterStatus.UP.value, + timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name} | grep "STARTING\|RUNNING\|SUCCEEDED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. + format( + job_name=name, + job_status= + f'({JobStatus.SUCCEEDED.value}|{JobStatus.RUNNING.value})', + timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', ) @@ -1817,6 +1831,7 @@ def test_large_job_queue(generic_cloud: str): f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', 'sleep 90', + # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', From 8675df39250be8db57593ad7e4d99ca1e6b13a24 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 16:44:15 +0800 Subject: [PATCH 09/22] support test_managed_jobs_pipeline_failed_setup --- tests/test_smoke.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 434f0099b12..0b86aaa7227 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -173,8 +173,9 @@ # Managed job functions _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( - 'sky queue {cluster_name}', - 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + 'sky queue {cluster_name}', 'sky jobs queue').replace( + 'awk "\\$2 == \\"{job_name}\\"', + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"') # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by @@ -3021,7 +3022,10 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - 'sleep 600', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{JobStatus.FAILED_SETUP.value}', + timeout=600), # Make sure the job failed quickly. f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', # Task 0 should be SUCCEEDED. 
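Since the rendered check compares the status token with bash's =~ operator, a single wait step can also accept any of several states by passing an alternation built from the enum values, as the start/stop tests above do. A small illustrative sketch (the cluster name is a placeholder):

    # Accept either INIT or STOPPED once autostop has kicked in.
    acceptable = f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})'
    step = _WAIT_UNTIL_CLUSTER_STATUS_IS.format(
        cluster_name='demo-cluster',  # illustrative cluster name
        cluster_status=acceptable,
        timeout=200)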
From 7e7c055d1b74464021f7b88b4daf1cfd46d4b9e5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:08:34 +0800 Subject: [PATCH 10/22] support test_managed_jobs_recovery_aws --- tests/test_smoke.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 0b86aaa7227..b22643ec439 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3059,8 +3059,8 @@ def test_managed_jobs_recovery_aws(aws_config_region): 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, job_status=JobStatus.RUNNING.value, timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3070,8 +3070,8 @@ def test_managed_jobs_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, job_status=JobStatus.RUNNING.value, timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', From f631cd3151eab76e2b04bddf930372fbf7daa27a Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:55:16 +0800 Subject: [PATCH 11/22] manged job status --- tests/test_smoke.py | 69 ++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b22643ec439..d3f0e0b6adc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -59,6 +59,7 @@ from sky.data import data_utils from sky.data import storage as storage_lib from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus from sky.skylet import constants from sky.skylet import events from sky.skylet.job_lib import JobStatus @@ -101,6 +102,8 @@ # Cluster functions _ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) _ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) +_ALL_MANAGED_JOB_STATUSES = "|".join( + [status.value for status in ManagedJobStatus]) _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status @@ -175,7 +178,8 @@ _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( 'sky queue {cluster_name}', 'sky jobs queue').replace( 'awk "\\$2 == \\"{job_name}\\"', - 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"') + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( + _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. 
To address this, we extend the timeout by @@ -568,7 +572,7 @@ def test_aws_with_ssh_proxy_command(): format( job_name=name, job_status= - f'({JobStatus.SUCCEEDED.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value})', timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', @@ -2914,17 +2918,17 @@ def test_managed_jobs(generic_cloud: str): _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-1', job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', timeout=60), _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-2', job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', timeout=60), f'sky jobs cancel -y -n {name}-1', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-1', - job_status=f'{JobStatus.CANCELLED.value}', + job_status=f'{ManagedJobStatus.CANCELLED.value}', timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', @@ -2998,7 +3002,7 @@ def test_managed_jobs_failed_setup(generic_cloud: str): # Make sure the job failed quickly. _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, - job_status=f'{JobStatus.FAILED_SETUP.value}', + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -3024,7 +3028,7 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, - job_status=f'{JobStatus.FAILED_SETUP.value}', + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', timeout=600), # Make sure the job failed quickly. f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', @@ -3060,7 +3064,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, job_status=JobStatus.RUNNING.value, timeout=600), + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3071,7 +3077,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, job_status=JobStatus.RUNNING.value, timeout=200), + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3099,15 +3107,19 @@ def test_managed_jobs_recovery_gcp(): 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3130,8 +3142,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. 
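Managed jobs are polled through sky jobs queue and report ManagedJobStatus values, which include controller-side states such as STARTING and RECOVERING that the cluster-level JobStatus enum does not have; that is why the managed-job waits are rendered from ManagedJobStatus. A short sketch of the call pattern with an illustrative job name; adding _BUMP_UP_SECONDS keeps the wait window slightly longer than the event being waited for:

    step = _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
        job_name='recovery-demo',  # illustrative job name
        job_status=ManagedJobStatus.RUNNING.value,
        timeout=300 + _BUMP_UP_SECONDS)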
@@ -3150,8 +3164,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3181,8 +3197,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. @@ -3193,8 +3211,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3220,8 +3240,11 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str): 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|RECOVERING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + timeout=360), ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, From d822c4b1ee53fa64849343cdf62c27a5df017ba9 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:58:28 +0800 Subject: [PATCH 12/22] bug fix --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d3f0e0b6adc..799ff805faf 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -572,7 +572,7 @@ def test_aws_with_ssh_proxy_command(): format( job_name=name, job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value})', + f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', timeout=300), ], f'sky 
down -y {name} jump-{name}; sky jobs cancel -y -n {name}', From 9d8194e33ec88649f862ccb5ba041a086dfab857 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 18:16:24 +0800 Subject: [PATCH 13/22] test managed job cancel --- tests/test_smoke.py | 110 +++++++++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 42 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 799ff805faf..8792b106ea8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3264,8 +3264,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 450', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3276,8 +3278,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 560', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3305,15 +3309,19 @@ def test_managed_jobs_recovery_multi_node_gcp(): 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 420', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3338,13 +3346,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): [ # Test cancellation during spot cluster being launched. 
f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3352,12 +3363,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3365,8 +3380,11 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + # The job is running in the cluster, will shown as RUNNING. + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' @@ -3376,10 +3394,10 @@ def test_managed_jobs_cancellation_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$(aws ec2 describe-instances --region {region} ' @@ -3414,34 +3432,42 @@ def test_managed_jobs_cancellation_gcp(): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' From 41dfbee2c5bc7d3ef90cb524e3c8e7911c6b63ad Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 18:28:25 +0800 Subject: [PATCH 14/22] test_managed_jobs_storage --- tests/test_smoke.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8792b106ea8..21b2c70cfbf 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3557,8 +3557,10 @@ def test_managed_jobs_storage(generic_cloud: str): *STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region - 'sleep 60', # Wait the spot queue to be updated - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. + format(job_name=name, + job_status=ManagedJobStatus.SUCCEEDED.value, + timeout=60 + _BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd From 6a13540d3134c5ba5f4d648e9c37e8a111f7a6f9 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 11:11:15 +0800 Subject: [PATCH 15/22] more test cases --- tests/test_smoke.py | 59 +++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 21b2c70cfbf..bf1178a6629 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -105,7 +105,7 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -_WAIT_UNTIL_CLUSTER_STATUS_IS = ( +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 'start_time=$SECONDS; ' @@ -123,7 +123,7 @@ 'sleep 10; ' 'done') -_WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_IS.replace( +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', 'awk "/^{cluster_name_awk}/') @@ -499,7 +499,7 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'sky status -r {name} | grep UP', # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -562,7 +562,7 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- _WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD.format( cluster_name=f'sky-jobs-controller-*', cluster_name_awk='sky-jobs-controller-.*', cluster_status=ClusterStatus.UP.value, @@ -943,7 +943,7 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=60), @@ -1060,7 +1060,7 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=80), @@ -1084,7 +1084,7 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=100), @@ -1115,7 +1115,7 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=40), @@ -2556,14 +2556,14 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status= f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', @@ -2590,7 +2590,7 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status= f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', @@ -2631,7 +2631,7 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -2650,7 +2650,7 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 
'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -2663,7 +2663,7 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout + _BUMP_UP_SECONDS), @@ -2883,7 +2883,7 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=90), @@ -2892,7 +2892,7 @@ def test_stop_gcp_spot(): f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=120), @@ -3584,10 +3584,16 @@ def test_managed_jobs_tpu(): 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep STARTING', - 'sleep 900', # TPU takes a while to launch - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|SUCCEEDED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), + # TPU takes a while to launch + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', + timeout=900 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3605,8 +3611,10 @@ def test_managed_jobs_inline_env(generic_cloud: str): 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.SUCCEEDED.value, + timeout=20 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3713,8 +3721,11 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- 'sleep 200', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + timeout=200 + _BUMP_UP_SECONDS), f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', From d83647fe1b897b5317bf42096a001b74d5db18e2 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 18:23:33 +0800 Subject: [PATCH 16/22] resolve pr comment --- tests/test_smoke.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index bf1178a6629..53a6e517b3b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -123,10 +123,19 @@ 'sleep 10; ' 'done') -_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( - 'sky status {cluster_name}', - 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', - 'awk "/^{cluster_name_awk}/') + +def get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard: str, cluster_status: str, timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + 'sky status {cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + return wait_cmd.format(cluster_name=cluster_name_wildcard, + cluster_name_awk=cluster_name_wildcard.replace( + '*', '.*'), + cluster_status=cluster_status, + timeout=timeout) + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout @@ -562,9 +571,8 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD.format( - cluster_name=f'sky-jobs-controller-*', - cluster_name_awk='sky-jobs-controller-.*', + get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', From 573e83efb3a1c73e52720535911cf043f4d8857e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 18:29:27 +0800 Subject: [PATCH 17/22] private member function --- tests/test_smoke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 53a6e517b3b..8e54e9856a9 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -124,7 +124,7 @@ 'done') -def get_cmd_wait_until_cluster_status_contains_wildcard( +def _get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard: str, cluster_status: str, timeout: int): wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', @@ -571,7 +571,7 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- get_cmd_wait_until_cluster_status_contains_wildcard( + _get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), From 1202d1a5637bb31c7c97fd86c2ac0e105763bc1d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 14:16:03 +0800 Subject: [PATCH 18/22] bug fix --- tests/test_smoke.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index fdc83fb2192..a629816cb22 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3735,7 +3735,7 @@ def test_azure_start_stop_two_nodes(): cluster_name=name, cluster_status= f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', - timeout=200 + _BUMP_UP_SECONDS), + timeout=200 + _BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', @@ -4746,7 +4746,10 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - time.sleep(120) + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED, + timeout=120) # Run it again - should work with fast=True sky.launch(task, cluster_name=name, From 3cabfaa25ecacf44ce2ec16740305ff8fb91fe7d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sun, 24 Nov 2024 23:31:42 +0800 Subject: [PATCH 19/22] interface change --- tests/test_smoke.py | 315 +++++++++++++++++++++++++++----------------- 1 file changed, 191 insertions(+), 124 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index a629816cb22..b75e08c021e 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -105,6 +105,15 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) + +def _statuses_to_str(statuses: List[enum.Enum]): + """Convert a list of enums to a string with all the values separated by |.""" + if len(statuses) > 1: + return '(' + '|'.join([status.value for status in statuses]) + ')' + else: + return statuses[0].value + + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 
@@ -124,8 +133,17 @@ 'done') +def _get_cmd_wait_until_cluster_status_contains( + cluster_name: str, cluster_status: List[ClusterStatus], timeout: int): + return _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=cluster_name, + cluster_status=(cluster_status), + timeout=timeout) + + def _get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard: str, cluster_status: str, timeout: int): + cluster_name_wildcard: str, cluster_status: List[ClusterStatus], + timeout: int): wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', @@ -133,7 +151,7 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( return wait_cmd.format(cluster_name=cluster_name_wildcard, cluster_name_awk=cluster_name_wildcard.replace( '*', '.*'), - cluster_status=cluster_status, + cluster_status=_statuses_to_str(cluster_status), timeout=timeout) @@ -147,10 +165,16 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' ' echo "Cluster {cluster_name} successfully removed."; break; ' 'fi; ' - 'echo "Waiting for cluster {name} to be removed..."; ' + 'echo "Waiting for cluster {cluster_name} to be removed..."; ' 'sleep 10; ' 'done') + +def _get_cmd_wait_until_cluster_is_not_found(cluster_name: str, timeout: int): + return _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=cluster_name, + timeout=timeout) + + _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. @@ -182,6 +206,35 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') + +def _get_cmd_wait_until_job_status_contains_matching_job_id( + cluster_name: str, job_id: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=cluster_name, + job_id=job_id, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def _get_cmd_wait_until_job_status_contains_without_matching_job( + cluster_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=cluster_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def _get_cmd_wait_until_job_status_contains_matching_job_name( + cluster_name: str, job_name: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + cluster_name=cluster_name, + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + # Managed job functions _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( @@ -190,6 +243,15 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) + +def _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + # After the timeout, the cluster will stop if autostop is 
set, and our check # should be more than the timeout. To address this, we extend the timeout by # _BUMP_UP_SECONDS before exiting. @@ -508,9 +570,9 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'sky status -r {name} | grep UP', # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Launch again. Do full output validation - we expect the cluster to re-launch @@ -573,14 +635,15 @@ def test_aws_with_ssh_proxy_command(): # the job controller is not launched with proxy command. _get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', - cluster_status=ClusterStatus.UP.value, + cluster_status=[ClusterStatus.UP], timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. - format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', + job_status=[ + ManagedJobStatus.SUCCEEDED, ManagedJobStatus.RUNNING, + ManagedJobStatus.STARTING + ], timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', @@ -951,9 +1014,9 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=60), # Wait for EC2 instance to be in stopped state. # TODO: event based wait. @@ -1003,8 +1066,8 @@ def test_gcp_mig(): # Check MIG exists. f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, - timeout=120), + _get_cmd_wait_until_cluster_is_not_found(cluster_name=name, + timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. 
@@ -1071,9 +1134,9 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', @@ -1095,9 +1158,9 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', @@ -1126,17 +1189,17 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + _get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, + job_status=[JobStatus.FAILED_DRIVER], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', @@ -1167,9 +1230,9 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + _get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, + job_status=[JobStatus.FAILED_DRIVER], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', @@ -1990,10 +2053,10 @@ def test_multi_echo(generic_cloud: str): ] + # Ensure jobs succeeded. [ - _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + _get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=i + 1, - job_status=JobStatus.SUCCEEDED.value, + job_status=[JobStatus.SUCCEEDED], timeout=120) for i in range(32) ] + # Ensure monitor/autoscaler didn't crash on the 'assert not @@ -2567,17 +2630,16 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. 
- _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=200), ], f'sky down -y {name}', @@ -2601,10 +2663,9 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=280) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], @@ -2642,9 +2703,9 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). @@ -2661,9 +2722,9 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Test restarting the idleness timer via exec: @@ -2673,9 +2734,9 @@ def test_autostop(generic_cloud: str): 'sleep 45', # Almost reached the threshold. f'sky exec {name} echo hi', # Should restart the timer. 
'sleep 45', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout + _BUMP_UP_SECONDS), ], f'sky down -y {name}', @@ -2893,18 +2954,18 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=120), ], f'sky down -y {name}', @@ -2925,20 +2986,24 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.INIT, + ManagedJobStatus.RUNNING + ], timeout=60), - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.INIT, + ManagedJobStatus.RUNNING + ], timeout=60), f'sky jobs cancel -y -n {name}-1', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status=f'{ManagedJobStatus.CANCELLED.value}', + job_status=[ManagedJobStatus.CANCELLED], timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', @@ -3010,9 +3075,9 @@ def test_managed_jobs_failed_setup(generic_cloud: str): [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', # Make sure the job failed quickly. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + job_status=[ManagedJobStatus.FAILED_SETUP], timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -3036,9 +3101,9 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + job_status=[ManagedJobStatus.FAILED_SETUP], timeout=600), # Make sure the job failed quickly. 
f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', @@ -3073,9 +3138,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. @@ -3086,9 +3151,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], @@ -3117,18 +3182,18 @@ def test_managed_jobs_recovery_gcp(): 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], @@ -3152,9 +3217,9 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -3174,9 +3239,9 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -3207,9 +3272,9 @@ def test_managed_jobs_pipeline_recovery_gcp(): 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -3221,9 +3286,9 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -3250,10 +3315,11 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str): 
'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + job_status=[ + ManagedJobStatus.RUNNING, ManagedJobStatus.RECOVERING + ], timeout=360), ], f'sky jobs cancel -y -n {name}', @@ -3274,9 +3340,9 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. @@ -3288,9 +3354,9 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], @@ -3319,18 +3385,18 @@ def test_managed_jobs_recovery_multi_node_gcp(): 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], @@ -3356,15 +3422,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): [ # Test cancellation during spot cluster being launched. 
f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + ], timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' @@ -3374,14 +3441,14 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' @@ -3391,9 +3458,9 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', # The job is running in the cluster, will shown as RUNNING. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3404,9 +3471,9 @@ def test_managed_jobs_cancellation_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. @@ -3442,41 +3509,41 @@ def test_managed_jobs_cancellation_gcp(): [ # Test cancellation during spot cluster being launched. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.STARTING.value, + job_status=[ManagedJobStatus.STARTING], timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. @@ -3567,10 +3634,10 @@ def test_managed_jobs_storage(generic_cloud: str): *STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. 
- format(job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, - timeout=60 + _BUMP_UP_SECONDS), + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.SUCCEEDED], + timeout=60 + _BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd @@ -3594,15 +3661,16 @@ def test_managed_jobs_tpu(): 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.STARTING.value, + job_status=[ManagedJobStatus.STARTING], timeout=60 + _BUMP_UP_SECONDS), # TPU takes a while to launch - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', + job_status=[ + ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED + ], timeout=900 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -3621,9 +3689,9 @@ def test_managed_jobs_inline_env(generic_cloud: str): 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, + job_status=[ManagedJobStatus.SUCCEEDED], timeout=20 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -3731,10 +3799,9 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + cluster_status=[ClusterStatus.INIT, ClusterStatus.STOPPED], timeout=200 + _BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], @@ -4746,9 +4813,9 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED, + cluster_status=[ClusterStatus.STOPPED], timeout=120) # Run it again - should work with fast=True sky.launch(task, From c71795c66bf0baf1d2afee205c5fe9719e1c840d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 13:09:33 +0800 Subject: [PATCH 20/22] bug fix --- tests/test_smoke.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b75e08c021e..f394a739f12 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -574,7 +574,9 @@ def test_launch_fast_with_autostop(generic_cloud: str): cluster_name=name, cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), - + # Even the cluster is stopped, cloud platform may take a while to + # delete the VM. + f'sleep {_BUMP_UP_SECONDS}', # Launch again. Do full output validation - we expect the cluster to re-launch f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', @@ -2989,15 +2991,15 @@ def test_managed_jobs(generic_cloud: str): _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.INIT, - ManagedJobStatus.RUNNING + ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], timeout=60), _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.INIT, - ManagedJobStatus.RUNNING + ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], timeout=60), f'sky jobs cancel -y -n {name}-1', From cbfc430d8a0a5ce2ad597232447c23173010b7d7 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 13:22:07 +0800 Subject: [PATCH 21/22] bug fix --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f394a739f12..90a98658e57 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -137,7 +137,7 @@ def _get_cmd_wait_until_cluster_status_contains( cluster_name: str, cluster_status: List[ClusterStatus], timeout: int): return _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=cluster_name, - cluster_status=(cluster_status), + cluster_status=_statuses_to_str(cluster_status), timeout=timeout) From 52fcbe850dd7cb2f2bb80843bc61ccddada95816 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 26 Nov 2024 17:33:34 +0800 Subject: [PATCH 22/22] raise error on empty status --- tests/test_smoke.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 90a98658e57..574dae21ea0 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -108,6 +108,7 @@ def _statuses_to_str(statuses: List[enum.Enum]): 
"""Convert a list of enums to a string with all the values separated by |.""" + assert len(statuses) > 0, 'statuses must not be empty' if len(statuses) > 1: return '(' + '|'.join([status.value for status in statuses]) + ')' else: