From 468409c51d3daf8a0df6632504a8cff0f324429e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 7 Nov 2024 17:03:08 +0800 Subject: [PATCH 01/64] event based smoke test --- tests/test_smoke.py | 99 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index cdfd9dfc7cb..b51e720e84a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,6 +25,7 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws +import enum import inspect import json import os @@ -60,6 +61,8 @@ from sky.data.data_utils import Rclone from sky.skylet import constants from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus from sky.utils import common_utils from sky.utils import resources_utils from sky.utils import subprocess_utils @@ -95,6 +98,64 @@ 'sleep 10; s=$(sky jobs queue);' 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') +_WAIT_UNTIL_CLUSTER_STATUS_IS = ( + # A while loop to wait until the cluster status + # becomes certain status, with timeout. 
+ 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky status {cluster_name} --refresh | ' + 'awk "/^{cluster_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' + 'if [ "$current_status" == "{cluster_status}" ]; ' + 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' + 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' + 'sleep 30; ' + 'done') + +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( + # A while loop to wait until the cluster is not found or timeout + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' + 'fi; ' + 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' + ' echo "Cluster {cluster_name} successfully removed."; break; ' + 'fi; ' + 'echo "Waiting for cluster {name} to be removed..."; ' + 'sleep 15; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS = ( + # A while loop to wait until the job status + # contains certain status, with timeout. 
+ 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky queue {cluster_name} | ' + 'awk "/{job_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' + 'found=0; ' # Initialize found variable outside the loop + 'while read -r line; do ' # Read line by line + ' if [ "$line" == "{job_status}" ]; then ' # Check each line + ' echo "Target job status \'{job_status}\' reached."; ' + ' found=1; ' + ' break; ' # Break inner loop + ' fi; ' + 'done <<< "$current_status"; ' + 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found + 'echo "Waiting for job status to contains \'{job_status}\', current status: $current_status"; ' + 'sleep 15; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS.replace( + 'awk "/{job_name}/', 'awk "') + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -399,7 +460,6 @@ def test_launch_fast_with_autostop(generic_cloud: str): # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = Test( 'test_launch_fast_with_autostop', [ @@ -407,10 +467,12 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', - f'sleep {autostop_timeout}', # Ensure cluster is stopped - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Launch again. 
Do full output validation - we expect the cluster to re-launch f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', @@ -808,7 +870,10 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - 'sleep 60', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=60), f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', f'sky logs {name}-clone 1 --status', @@ -854,8 +919,8 @@ def test_gcp_mig(): # Check MIG exists. f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - 'sleep 120', - f'sky status -r {name}; sky status {name} | grep "{name} not found"', + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. 
@@ -922,8 +987,10 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - 'sleep 60', - f'sky status -r {name} | grep "STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', @@ -944,7 +1011,10 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - 'sleep 100', # Ensure this is large enough, else GCP leaks. + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', @@ -972,13 +1042,18 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - 'sleep 40', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', ) From 7191844ae2a7466897c75fc42ed9c116936a0db5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 17:14:31 +0800 Subject: [PATCH 02/64] more event based smoke test --- tests/test_smoke.py | 47 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b51e720e84a..a11ff9d8ed8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -108,11 +108,11 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' - 'if [ "$current_status" == "{cluster_status}" ]; ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^{cluster_status}$/) print \$i}}"); ' + 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' - 'sleep 30; ' + 'sleep 15; ' 'done') _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( @@ -129,7 +129,7 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS = ( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. 
'start_time=$SECONDS; ' @@ -138,7 +138,7 @@ ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' 'fi; ' 'current_status=$(sky queue {cluster_name} | ' - 'awk "/{job_name}/ ' + 'awk "\\$1 == \\"{job_id}\\" ' '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line @@ -153,8 +153,11 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS.replace( - 'awk "/{job_name}/', 'awk "') +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_name}\\"', 'awk "') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\"') DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -1083,8 +1086,10 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', ) @@ -1888,10 +1893,15 @@ def test_multi_echo(generic_cloud: str): 'multi_echo', [ f'python examples/multi_echo.py {name} {generic_cloud}', - 'sleep 120', ] + # Ensure jobs succeeded. 
- [f'sky logs {name} {i + 1} --status' for i in range(32)] + + [ + _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=name, + job_id=i + 1, + job_status=JobStatus.SUCCEEDED.value, + timeout=120) for i in range(32) + ] + # Ensure monitor/autoscaler didn't crash on the 'assert not # unfulfilled' error. If process not found, grep->ssh returns 1. [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], @@ -1984,7 +1994,8 @@ def test_tpu(): f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. ], - f'sky down -y {name}', + 'echo "hello"', + #f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) run_one_test(test) @@ -2444,12 +2455,18 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - f'sleep 20', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. 
- 'sleep 180', - f'sky status -r {name} | grep "INIT\|STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=200), ], f'sky down -y {name}', ) From 5cbebebae882ff172917727b4aa00ab767bd986e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 17:56:37 +0800 Subject: [PATCH 03/64] more test cases --- tests/test_smoke.py | 53 ++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index a11ff9d8ed8..9c422cda194 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -108,10 +108,10 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^{cluster_status}$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' 'if [[ "$current_status" =~ {cluster_status} ]]; ' - 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' - 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' + 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' + 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' 'sleep 15; ' 'done') @@ -143,21 +143,21 @@ 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line ' if [ "$line" == "{job_status}" ]; then ' # Check each line - ' echo "Target job status \'{job_status}\' reached."; ' + ' echo "Target job status {job_status} reached."; ' ' found=1; ' ' break; ' # Break inner loop ' fi; ' 'done <<< "$current_status"; ' 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found - 'echo "Waiting for job status to contains \'{job_status}\', current status: $current_status"; ' + 'echo "Waiting for job status to contains {job_status}, 
current status: $current_status"; ' 'sleep 15; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_name}\\"', 'awk "') + 'awk "\\$1 == \\"{job_id}\\"', 'awk "') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\"') + 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -2489,9 +2489,12 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - 'sleep 260', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=280) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], f'sky down -y {name}', timeout=30 * 60, # 30 mins @@ -2527,8 +2530,10 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). f'sky start -y {name}', @@ -2544,8 +2549,10 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 
'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Test restarting the idleness timer via exec: f'sky start -y {name}', @@ -2555,8 +2562,10 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, @@ -2773,15 +2782,19 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - 'sleep 90', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - 'sleep 120', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=120), ], f'sky down -y {name}', ) From 6f6840901b8407be2e20d9093565813029b2f83e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 18:43:20 +0800 Subject: [PATCH 04/64] more test cases with managed jobs 
--- tests/test_smoke.py | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9c422cda194..339d7062b0a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -98,6 +98,8 @@ 'sleep 10; s=$(sky jobs queue);' 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') +# Cluster functions + _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. @@ -142,7 +144,7 @@ '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line - ' if [ "$line" == "{job_status}" ]; then ' # Check each line + ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line ' echo "Target job status {job_status} reached."; ' ' found=1; ' ' break; ' # Break inner loop @@ -153,12 +155,18 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') +# Managed job functions + +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( + 'sky queue {cluster_name}', + 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -1053,7 +1061,7 @@ def test_aws_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( cluster_name=name, job_status=JobStatus.FAILED.value, timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), @@ -1086,7 +1094,7 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( cluster_name=name, job_status=JobStatus.FAILED.value, timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) @@ -2814,14 +2822,21 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), f'sky jobs cancel -y -n {name}-1', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep CANCELLED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=f'{JobStatus.CANCELLED.value}', + timeout=230), # Test the functionality for logging. 
f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', @@ -2891,9 +2906,11 @@ def test_managed_jobs_failed_setup(generic_cloud: str): 'managed_jobs_failed_setup', [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - 'sleep 330', # Make sure the job failed quickly. - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{JobStatus.FAILED_SETUP.value}', + timeout=330), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From 1f67691aec7a6d66cf7733190e7ce5a142c361cb Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 18:58:41 +0800 Subject: [PATCH 05/64] bug fix --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 339d7062b0a..043cb63ea96 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2834,7 +2834,7 @@ def test_managed_jobs(generic_cloud: str): timeout=60), f'sky jobs cancel -y -n {name}-1', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', + job_name=f'{name}-1', job_status=f'{JobStatus.CANCELLED.value}', timeout=230), # Test the functionality for logging. 
From be7964ece6275ca782c17d50c5f8db5187cf9bfd Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 13 Nov 2024 16:41:13 +0800 Subject: [PATCH 06/64] bump up seconds --- tests/test_smoke.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e6daae0e588..7d415708cfc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -114,7 +114,7 @@ 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( @@ -128,7 +128,7 @@ ' echo "Cluster {cluster_name} successfully removed."; break; ' 'fi; ' 'echo "Waiting for cluster {name} to be removed..."; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( @@ -152,7 +152,7 @@ 'done <<< "$current_status"; ' 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( @@ -167,6 +167,11 @@ 'sky queue {cluster_name}', 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') +# After the timeout, the cluster will stop if autostop is set, and our check +# should be more than the timeout. To address this, we extend the timeout by +# _BUMP_UP_SECONDS before exiting. +_BUMP_UP_SECONDS = 35 + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -2043,8 +2048,7 @@ def test_tpu(): f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. 
], - 'echo "hello"', - #f'sky down -y {name}', + f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) run_one_test(test) @@ -2614,7 +2618,7 @@ def test_autostop(generic_cloud: str): _WAIT_UNTIL_CLUSTER_STATUS_IS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), + timeout=autostop_timeout + _BUMP_UP_SECONDS), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, @@ -2951,7 +2955,7 @@ def test_managed_jobs_failed_setup(generic_cloud: str): _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, job_status=f'{JobStatus.FAILED_SETUP.value}', - timeout=330), + timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From c464005216903f92e91cb7ca946318c31d50b33a Mon Sep 17 00:00:00 2001 From: zpoint Date: Sat, 16 Nov 2024 00:09:48 +0800 Subject: [PATCH 07/64] merge master and resolve conflict --- tests/test_smoke.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e254f6a0870..5aeb1f055fe 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -99,6 +99,8 @@ 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') # Cluster functions +_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) +_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status @@ -110,7 +112,8 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + + ')$/) print \$i}}"); ' 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' 'echo "Waiting for cluster status to become 
{cluster_status}, current status: $current_status"; ' @@ -141,7 +144,8 @@ 'fi; ' 'current_status=$(sky queue {cluster_name} | ' 'awk "\\$1 == \\"{job_id}\\" ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + + ')$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line From c054edf56499a39ed42e1e62fba66b5f81411551 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 13:42:24 +0800 Subject: [PATCH 08/64] more test case --- tests/test_smoke.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5aeb1f055fe..434f0099b12 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -120,6 +120,11 @@ 'sleep 10; ' 'done') +_WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_IS.replace( + 'sky status {cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' @@ -530,6 +535,7 @@ def test_aws_region(): @pytest.mark.aws def test_aws_with_ssh_proxy_command(): name = _get_cluster_name() + with tempfile.NamedTemporaryFile(mode='w') as f: f.write( textwrap.dedent(f"""\ @@ -551,10 +557,18 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- 'timeout 300s bash -c "until sky status sky-jobs-controller* | grep UP; do sleep 1; done"', + _WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD.format( + cluster_name=f'sky-jobs-controller-*', + cluster_name_awk='sky-jobs-controller-.*', + cluster_status=ClusterStatus.UP.value, + timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name} | grep "STARTING\|RUNNING\|SUCCEEDED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. + format( + job_name=name, + job_status= + f'({JobStatus.SUCCEEDED.value}|{JobStatus.RUNNING.value})', + timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', ) @@ -1817,6 +1831,7 @@ def test_large_job_queue(generic_cloud: str): f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', 'sleep 90', + # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. 
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', From 8675df39250be8db57593ad7e4d99ca1e6b13a24 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 16:44:15 +0800 Subject: [PATCH 09/64] support test_managed_jobs_pipeline_failed_setup --- tests/test_smoke.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 434f0099b12..0b86aaa7227 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -173,8 +173,9 @@ # Managed job functions _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( - 'sky queue {cluster_name}', - 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + 'sky queue {cluster_name}', 'sky jobs queue').replace( + 'awk "\\$2 == \\"{job_name}\\"', + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"') # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by @@ -3021,7 +3022,10 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - 'sleep 600', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{JobStatus.FAILED_SETUP.value}', + timeout=600), # Make sure the job failed quickly. f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', # Task 0 should be SUCCEEDED. 
From 7e7c055d1b74464021f7b88b4daf1cfd46d4b9e5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:08:34 +0800 Subject: [PATCH 10/64] support test_managed_jobs_recovery_aws --- tests/test_smoke.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 0b86aaa7227..b22643ec439 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3059,8 +3059,8 @@ def test_managed_jobs_recovery_aws(aws_config_region): 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, job_status=JobStatus.RUNNING.value, timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3070,8 +3070,8 @@ def test_managed_jobs_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, job_status=JobStatus.RUNNING.value, timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', From f631cd3151eab76e2b04bddf930372fbf7daa27a Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:55:16 +0800 Subject: [PATCH 11/64] manged job status --- tests/test_smoke.py | 69 ++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b22643ec439..d3f0e0b6adc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -59,6 +59,7 @@ from sky.data import data_utils from sky.data import storage as storage_lib from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus from sky.skylet import constants from sky.skylet import events from sky.skylet.job_lib import JobStatus @@ -101,6 +102,8 @@ # Cluster functions _ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) _ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) +_ALL_MANAGED_JOB_STATUSES = "|".join( + [status.value for status in ManagedJobStatus]) _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status @@ -175,7 +178,8 @@ _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( 'sky queue {cluster_name}', 'sky jobs queue').replace( 'awk "\\$2 == \\"{job_name}\\"', - 'awk "\\$2 == \\"{job_name}\\" || \\$3 
== \\"{job_name}\\"') + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( + _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by @@ -568,7 +572,7 @@ def test_aws_with_ssh_proxy_command(): format( job_name=name, job_status= - f'({JobStatus.SUCCEEDED.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value})', timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', @@ -2914,17 +2918,17 @@ def test_managed_jobs(generic_cloud: str): _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-1', job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', timeout=60), _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-2', job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', timeout=60), f'sky jobs cancel -y -n {name}-1', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-1', - job_status=f'{JobStatus.CANCELLED.value}', + job_status=f'{ManagedJobStatus.CANCELLED.value}', timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', @@ -2998,7 +3002,7 @@ def test_managed_jobs_failed_setup(generic_cloud: str): # Make sure the job failed quickly. 
_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, - job_status=f'{JobStatus.FAILED_SETUP.value}', + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -3024,7 +3028,7 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, - job_status=f'{JobStatus.FAILED_SETUP.value}', + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', timeout=600), # Make sure the job failed quickly. f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', @@ -3060,7 +3064,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, job_status=JobStatus.RUNNING.value, timeout=600), + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3071,7 +3077,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, job_status=JobStatus.RUNNING.value, timeout=200), + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3099,15 +3107,19 @@ def test_managed_jobs_recovery_gcp(): 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3130,8 +3142,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. 
@@ -3150,8 +3164,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3181,8 +3197,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. 
@@ -3193,8 +3211,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3220,8 +3240,11 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str): 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|RECOVERING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + timeout=360), ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, From d822c4b1ee53fa64849343cdf62c27a5df017ba9 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:58:28 +0800 Subject: [PATCH 12/64] bug fix --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d3f0e0b6adc..799ff805faf 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -572,7 +572,7 @@ def test_aws_with_ssh_proxy_command(): format( job_name=name, job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value})', + 
f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', From 9d8194e33ec88649f862ccb5ba041a086dfab857 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 18:16:24 +0800 Subject: [PATCH 13/64] test managed job cancel --- tests/test_smoke.py | 110 +++++++++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 42 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 799ff805faf..8792b106ea8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3264,8 +3264,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 450', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3276,8 +3278,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 560', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3305,15 +3309,19 @@ def test_managed_jobs_recovery_multi_node_gcp(): 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 420', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3338,13 +3346,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3352,12 +3363,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. 
+ _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3365,8 +3380,11 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + # The job is running in the cluster, will shown as RUNNING. + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' @@ -3376,10 +3394,10 @@ def test_managed_jobs_cancellation_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$(aws ec2 describe-instances --region {region} ' @@ -3414,34 +3432,42 @@ def test_managed_jobs_cancellation_gcp(): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' From 41dfbee2c5bc7d3ef90cb524e3c8e7911c6b63ad Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 18:28:25 +0800 Subject: [PATCH 14/64] test_managed_jobs_storage --- tests/test_smoke.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8792b106ea8..21b2c70cfbf 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3557,8 +3557,10 @@ def test_managed_jobs_storage(generic_cloud: str): *STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region - 'sleep 60', # Wait the spot queue to be updated - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. 
+ format(job_name=name, + job_status=ManagedJobStatus.SUCCEEDED.value, + timeout=60 + _BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd From 6a13540d3134c5ba5f4d648e9c37e8a111f7a6f9 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 11:11:15 +0800 Subject: [PATCH 15/64] more test cases --- tests/test_smoke.py | 59 +++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 21b2c70cfbf..bf1178a6629 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -105,7 +105,7 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -_WAIT_UNTIL_CLUSTER_STATUS_IS = ( +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 'start_time=$SECONDS; ' @@ -123,7 +123,7 @@ 'sleep 10; ' 'done') -_WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_IS.replace( +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', 'awk "/^{cluster_name_awk}/') @@ -499,7 +499,7 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'sky status -r {name} | grep UP', # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -562,7 +562,7 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- _WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD.format( cluster_name=f'sky-jobs-controller-*', cluster_name_awk='sky-jobs-controller-.*', cluster_status=ClusterStatus.UP.value, @@ -943,7 +943,7 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=60), @@ -1060,7 +1060,7 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=80), @@ -1084,7 +1084,7 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=100), @@ -1115,7 +1115,7 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=40), @@ -2556,14 +2556,14 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. 
f'sky stop -y {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status= f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', @@ -2590,7 +2590,7 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status= f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', @@ -2631,7 +2631,7 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -2650,7 +2650,7 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -2663,7 +2663,7 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 
'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout + _BUMP_UP_SECONDS), @@ -2883,7 +2883,7 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=90), @@ -2892,7 +2892,7 @@ def test_stop_gcp_spot(): f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=120), @@ -3584,10 +3584,16 @@ def test_managed_jobs_tpu(): 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep STARTING', - 'sleep 900', # TPU takes a while to launch - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|SUCCEEDED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), + # TPU takes a while to launch + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', + timeout=900 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3605,8 +3611,10 @@ def test_managed_jobs_inline_env(generic_cloud: str): 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! 
-z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.SUCCEEDED.value, + timeout=20 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3713,8 +3721,11 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. - 'sleep 200', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + timeout=200 + _BUMP_UP_SECONDS), f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', From d83647fe1b897b5317bf42096a001b74d5db18e2 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 18:23:33 +0800 Subject: [PATCH 16/64] resolve pr comment --- tests/test_smoke.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index bf1178a6629..53a6e517b3b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -123,10 +123,19 @@ 'sleep 10; ' 'done') -_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( - 'sky status {cluster_name}', - 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', - 'awk "/^{cluster_name_awk}/') + +def get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard: str, cluster_status: str, timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + 'sky status 
{cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + return wait_cmd.format(cluster_name=cluster_name_wildcard, + cluster_name_awk=cluster_name_wildcard.replace( + '*', '.*'), + cluster_status=cluster_status, + timeout=timeout) + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout @@ -562,9 +571,8 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD.format( - cluster_name=f'sky-jobs-controller-*', - cluster_name_awk='sky-jobs-controller-.*', + get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', From 573e83efb3a1c73e52720535911cf043f4d8857e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 18:29:27 +0800 Subject: [PATCH 17/64] private member function --- tests/test_smoke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 53a6e517b3b..8e54e9856a9 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -124,7 +124,7 @@ 'done') -def get_cmd_wait_until_cluster_status_contains_wildcard( +def _get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard: str, cluster_status: str, timeout: int): wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', @@ -571,7 +571,7 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with 
proxy command. - get_cmd_wait_until_cluster_status_contains_wildcard( + _get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), From 1202d1a5637bb31c7c97fd86c2ac0e105763bc1d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 14:16:03 +0800 Subject: [PATCH 18/64] bug fix --- tests/test_smoke.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index fdc83fb2192..a629816cb22 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3735,7 +3735,7 @@ def test_azure_start_stop_two_nodes(): cluster_name=name, cluster_status= f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', - timeout=200 + _BUMP_UP_SECONDS), + timeout=200 + _BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', @@ -4746,7 +4746,10 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - time.sleep(120) + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED, + timeout=120) # Run it again - should work with fast=True sky.launch(task, cluster_name=name, From 87d7f1248730e0f2921ba1fd9dc558d22ddb4554 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 16:55:16 +0800 Subject: [PATCH 19/64] restructure --- tests/smoke_tests/__init__.py | 2 + tests/smoke_tests/test_basic.py | 206 ++++ tests/smoke_tests/test_images.py | 472 ++++++++++ tests/smoke_tests/test_region_and_zone.py | 267 ++++++ tests/{ => smoke_tests}/test_smoke.py | 1035 +-------------------- tests/smoke_tests/util.py | 381 ++++++++ 6 files changed, 1349 insertions(+), 1014 deletions(-) create mode 100644 tests/smoke_tests/__init__.py create mode 100644 tests/smoke_tests/test_basic.py create mode 100644 tests/smoke_tests/test_images.py create mode 100644 
tests/smoke_tests/test_region_and_zone.py rename tests/{ => smoke_tests}/test_smoke.py (83%) create mode 100644 tests/smoke_tests/util.py diff --git a/tests/smoke_tests/__init__.py b/tests/smoke_tests/__init__.py new file mode 100644 index 00000000000..7f91740c201 --- /dev/null +++ b/tests/smoke_tests/__init__.py @@ -0,0 +1,2 @@ +"""For smoke tests import.""" +__all__ = ['util'] diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py new file mode 100644 index 00000000000..9d8a1225e42 --- /dev/null +++ b/tests/smoke_tests/test_basic.py @@ -0,0 +1,206 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +import enum +import inspect +import json +import os +import pathlib +import shlex +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import jinja2 +import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util 
import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + + +# ---------- Dry run: 2 Tasks in a chain. ---------- +@pytest.mark.no_fluidstack #requires GCP and AWS set up +def test_example_app(): + test = Test( + 'example_app', + ['python examples/example_app.py'], + ) + run_one_test(test) + + +# ---------- A minimal task ---------- +def test_minimal(generic_cloud: str): + name = _get_cluster_name() + test = Test( + 'minimal', + [ + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + # Output validation done. 
+ f'sky logs {name} 1 --status', + f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. + # Test launch output again on existing cluster + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + # Check the logs downloading + f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log', + # Ensure the raylet process has the correct file descriptor limit. + f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + # Install jq for the next test. + f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', + # Check the cluster info + f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', + f'sky logs {name} 5 --status', # Ensure the job succeeded. + f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', + f'sky logs {name} 6 --status', # Ensure the job succeeded. 
+ # Test '-c' for exec + f'sky exec -c {name} echo', + f'sky logs {name} 7 --status', + f'sky exec echo -c {name}', + f'sky logs {name} 8 --status', + f'sky exec -c {name} echo hi test', + f'sky logs {name} 9 | grep "hi test"', + f'sky exec {name} && exit 1 || true', + f'sky exec -c {name} && exit 1 || true', + ], + f'sky down -y {name}', + _get_timeout(generic_cloud), + ) + run_one_test(test) + + +# ---------- Test fast launch ---------- +def test_launch_fast(generic_cloud: str): + name = _get_cluster_name() + + test = Test( + 'test_launch_fast', + [ + # First launch to create the cluster + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 1 --status', + + # Second launch to test fast launch - should not reprovision + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast tests/test_yamls/minimal.yaml) && ' + ' echo "$s" && ' + # Validate that cluster was not re-launched. + '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' + # Validate that setup was not re-run. + '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' + # Validate that the task ran and finished. + 'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + ], + f'sky down -y {name}', + timeout=_get_timeout(generic_cloud), + ) + run_one_test(test) + + +# See cloud exclusion explanations in test_autostop +@pytest.mark.no_fluidstack +@pytest.mark.no_lambda_cloud +@pytest.mark.no_ibm +@pytest.mark.no_kubernetes +def test_launch_fast_with_autostop(generic_cloud: str): + name = _get_cluster_name() + # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure + # the VM is stopped. 
+ autostop_timeout = 600 if generic_cloud == 'azure' else 250 + test = Test( + 'test_launch_fast_with_autostop', + [ + # First launch to create the cluster with a short autostop + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 1 --status', + f'sky status -r {name} | grep UP', + + # Ensure cluster is stopped + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), + + # Launch again. Do full output validation - we expect the cluster to re-launch + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + ], + f'sky down -y {name}', + timeout=_get_timeout(generic_cloud) + autostop_timeout, + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py new file mode 100644 index 00000000000..42438461f76 --- /dev/null +++ b/tests/smoke_tests/test_images.py @@ -0,0 +1,472 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +import enum +import inspect +import json +import os +import pathlib +import shlex 
+import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import jinja2 +import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + + 
+# ---------- Test the image ---------- +@pytest.mark.aws +def test_aws_images(): + name = _get_cluster_name() + test = Test( + 'aws_images', + [ + f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml && exit 1 || true', + f'sky launch -y -c {name} examples/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_images(): + name = _get_cluster_name() + test = Test( + 'gcp_images', + [ + f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky launch -c {name} --image-id skypilot:cpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_images(): + name = _get_cluster_name() + test = Test( + 'azure_images', + [ + f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ f'sky launch -c {name} --image-id skypilot:v1-ubuntu-2004 --cloud azure tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_image_id_dict(): + name = _get_cluster_name() + test = Test( + 'aws_image_id_dict', + [ + # Use image id dict. + f'sky launch -y -c {name} examples/per_region_images.yaml', + f'sky exec {name} examples/per_region_images.yaml', + f'sky exec {name} "ls ~"', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_image_id_dict(): + name = _get_cluster_name() + test = Test( + 'gcp_image_id_dict', + [ + # Use image id dict. + f'sky launch -y -c {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} "ls ~"', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_image_id_dict_region(): + name = _get_cluster_name() + test = Test( + 'aws_image_id_dict_region', + [ + # YAML has + # image_id: + # us-west-2: skypilot:gpu-ubuntu-1804 + # us-east-2: skypilot:gpu-ubuntu-2004 + # Use region to filter image_id dict. + f'sky launch -y -c {name} --region us-east-1 examples/per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. 
+ f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', + # Should success because the image id match for the region. + f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. + # Ensure exec works. + f'sky exec {name} --region us-east-2 examples/per_region_images.yaml', + f'sky exec {name} examples/per_region_images.yaml', + f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_image_id_dict_region(): + name = _get_cluster_name() + test = Test( + 'gcp_image_id_dict_region', + [ + # Use region to filter image_id dict. + f'sky launch -y -c {name} --region us-east1 tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. + f'sky launch -y -c {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', + # Should success because the image id match for the region. 
+ f'sky launch -c {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', + f'sky exec {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', + f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-west3', # Ensure the region is correct. + # Ensure exec works. + f'sky exec {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} --cloud gcp --region us-west3 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_image_id_dict_zone(): + name = _get_cluster_name() + test = Test( + 'aws_image_id_dict_zone', + [ + # YAML has + # image_id: + # us-west-2: skypilot:gpu-ubuntu-1804 + # us-east-2: skypilot:gpu-ubuntu-2004 + # Use zone to filter image_id dict. + f'sky launch -y -c {name} --zone us-east-1b examples/per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. + f'sky launch -y -c {name} --zone us-east-2a examples/per_region_images.yaml', + # Should success because the image id match for the zone. + f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + # Fail due to image id mismatch. 
+ f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-east-2a', # Ensure the zone is correct. + # Ensure exec works. + f'sky exec {name} --zone us-east-2a examples/per_region_images.yaml', + f'sky exec {name} examples/per_region_images.yaml', + f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_image_id_dict_zone(): + name = _get_cluster_name() + test = Test( + 'gcp_image_id_dict_zone', + [ + # Use zone to filter image_id dict. + f'sky launch -y -c {name} --zone us-east1-a tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. + f'sky launch -y -c {name} --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', + # Should success because the image id match for the zone. + f'sky launch -y -c {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', + f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', + # Fail due to image id mismatch. + f'sky exec {name} --cloud gcp --image-id skypilot:gpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-central1', # Ensure the zone is correct. + # Ensure exec works. 
+ f'sky exec {name} --cloud gcp --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} --cloud gcp --region us-central1 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_clone_disk_aws(): + name = _get_cluster_name() + test = Test( + 'clone_disk_aws', + [ + f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', + f'sky stop {name} -y', + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=60), + # Wait for EC2 instance to be in stopped state. + # TODO: event based wait. + 'sleep 60', + f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', + f'sky logs {name}-clone 1 --status', + f'sky logs {name}-clone-2 1 --status', + ], + f'sky down -y {name} {name}-clone {name}-clone-2', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_clone_disk_gcp(): + name = _get_cluster_name() + test = Test( + 'clone_disk_gcp', + [ + f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', + f'sky stop {name} -y', + f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud gcp --zone us-central1-a "cat ~/user_file.txt | grep hello"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud gcp --zone us-east1-b "cat 
~/user_file.txt | grep hello"', + f'sky logs {name}-clone 1 --status', + f'sky logs {name}-clone-2 1 --status', + ], + f'sky down -y {name} {name}-clone {name}-clone-2', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_mig(): + name = _get_cluster_name() + region = 'us-central1' + test = Test( + 'gcp_mig', + [ + f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + # Check MIG exists. + f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', + f'sky autostop -i 0 --down -y {name}', + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + timeout=120), + f'gcloud compute instance-templates list | grep "sky-it-{name}"', + # Launch again with the same region. The original instance template + # should be removed. 
+ f'sky launch -y -c {name} --gpus L4 --num-nodes 2 --region {region} nvidia-smi', + f'sky logs {name} 1 | grep "L4"', + f'sky down -y {name}', + f'gcloud compute instance-templates list | grep "sky-it-{name}" && exit 1 || true', + ], + f'sky down -y {name}', + env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_force_enable_external_ips(): + name = _get_cluster_name() + test_commands = [ + f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', + # Check network of vm is "default" + (f'gcloud compute instances list --filter=name~"{name}" --format=' + '"value(networkInterfaces.network)" | grep "networks/default"'), + # Check External NAT in network access configs, corresponds to external ip + (f'gcloud compute instances list --filter=name~"{name}" --format=' + '"value(networkInterfaces.accessConfigs[0].name)" | grep "External NAT"' + ), + f'sky down -y {name}', + ] + skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' + test = Test('gcp_force_enable_external_ips', + test_commands, + f'sky down -y {name}', + env={'SKYPILOT_CONFIG': skypilot_config}) + run_one_test(test) + + +@pytest.mark.aws +def test_image_no_conda(): + name = _get_cluster_name() + test = Test( + 'image_no_conda', + [ + # Use image id dict. 
+ f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', + f'sky logs {name} 1 --status', + f'sky stop {name} -y', + f'sky start {name} -y', + f'sky exec {name} examples/per_region_images.yaml', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation +@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances +def test_custom_default_conda_env(generic_cloud: str): + name = _get_cluster_name() + test = Test('custom_default_conda_env', [ + f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky status -r {name} | grep "UP"', + f'sky logs {name} 1 --status', + f'sky logs {name} 1 --no-follow | grep -E "myenv\\s+\\*"', + f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky logs {name} 2 --status', + f'sky autostop -y -i 0 {name}', + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=80), + f'sky start -y {name}', + f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', + f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky logs {name} 3 --status', + ], f'sky down -y {name}') + run_one_test(test) diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py new file mode 100644 index 00000000000..57f84ff4a0e --- /dev/null +++ b/tests/smoke_tests/test_region_and_zone.py @@ -0,0 +1,267 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest 
tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +import enum +import inspect +import json +import os +import pathlib +import shlex +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import jinja2 +import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from 
sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + + +# ---------- Test region ---------- +@pytest.mark.aws +def test_aws_region(): + name = _get_cluster_name() + test = Test( + 'aws_region', + [ + f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', + f'sky exec {name} examples/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_with_ssh_proxy_command(): + name = _get_cluster_name() + + with tempfile.NamedTemporaryFile(mode='w') as f: + f.write( + textwrap.dedent(f"""\ + aws: + ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} + """)) + f.flush() + test = Test( + 'aws_with_ssh_proxy_command', + [ + f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', + # Use jump config + f'export SKYPILOT_CONFIG={f.name}; ' + f'sky launch -y -c {name} --cloud aws --cpus 2 --region us-east-1 echo hi', + f'sky logs {name} 1 --status', + f'export SKYPILOT_CONFIG={f.name}; sky exec {name} echo hi', + f'sky logs {name} 2 --status', + # Start a small job to make sure the controller is created. + f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', + # Wait other tests to create the job controller first, so that + # the job controller is not launched with proxy command. + _get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard='sky-jobs-controller-*', + cluster_status=ClusterStatus.UP.value, + timeout=300), + f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. + format( + job_name=name, + job_status= + f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', + timeout=300), + ], + f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_region_and_service_account(): + name = _get_cluster_name() + test = Test( + 'gcp_region', + [ + f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', + f'sky exec {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', + f'sky logs {name} 4 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.ibm +def test_ibm_region(): + name = _get_cluster_name() + region = 'eu-de' + test = Test( + 'region', + [ + f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', + f'sky exec {name} --cloud ibm examples/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep {region}', # Ensure the region is correct. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_region(): + name = _get_cluster_name() + test = Test( + 'azure_region', + [ + f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', + f'sky exec {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. 
+ # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', + f'sky logs {name} 4 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Test zone ---------- +@pytest.mark.aws +def test_aws_zone(): + name = _get_cluster_name() + test = Test( + 'aws_zone', + [ + f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', + f'sky exec {name} examples/minimal.yaml --zone us-east-2b', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-east-2b', # Ensure the zone is correct. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.ibm +def test_ibm_zone(): + name = _get_cluster_name() + zone = 'eu-de-2' + test = Test( + 'zone', + [ + f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', + f'sky exec {name} --cloud ibm examples/minimal.yaml --zone {zone}', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep {zone}', # Ensure the zone is correct. + ], + f'sky down -y {name} {name}-2 {name}-3', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_zone(): + name = _get_cluster_name() + test = Test( + 'gcp_zone', + [ + f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', + f'sky exec {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-central1-a', # Ensure the zone is correct. 
+ ], + f'sky down -y {name}', + ) + run_one_test(test) diff --git a/tests/test_smoke.py b/tests/smoke_tests/test_smoke.py similarity index 83% rename from tests/test_smoke.py rename to tests/smoke_tests/test_smoke.py index a629816cb22..03132743c0e 100644 --- a/tests/test_smoke.py +++ b/tests/smoke_tests/test_smoke.py @@ -44,6 +44,27 @@ import colorama import jinja2 import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test import sky from sky import global_user_state @@ -68,1020 +89,6 @@ from sky.utils import resources_utils from sky.utils import subprocess_utils -# To avoid the second smoke test reusing the cluster launched in the first -# smoke test. Also required for test_managed_jobs_recovery to make sure the -# manual termination with aws ec2 does not accidentally terminate other clusters -# for for the different managed jobs launch with the same job name but a -# different job id. 
-test_id = str(uuid.uuid4())[-2:] - -LAMBDA_TYPE = '--cloud lambda --gpus A10' -FLUIDSTACK_TYPE = '--cloud fluidstack --gpus RTXA4000' - -SCP_TYPE = '--cloud scp' -SCP_GPU_V100 = '--gpus V100-32GB' - -STORAGE_SETUP_COMMANDS = [ - 'touch ~/tmpfile', 'mkdir -p ~/tmp-workdir', - 'touch ~/tmp-workdir/tmp\ file', 'touch ~/tmp-workdir/tmp\ file2', - 'touch ~/tmp-workdir/foo', - '[ ! -e ~/tmp-workdir/circle-link ] && ln -s ~/tmp-workdir/ ~/tmp-workdir/circle-link || true', - 'touch ~/.ssh/id_rsa.pub' -] - -# Get the job queue, and print it once on its own, then print it again to -# use with grep by the caller. -_GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' -# Wait for a job to be not in RUNNING state. Used to check for RECOVERING. -_JOB_WAIT_NOT_RUNNING = ( - 's=$(sky jobs queue);' - 'until ! echo "$s" | grep "{job_name}" | grep "RUNNING"; do ' - 'sleep 10; s=$(sky jobs queue);' - 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') - -# Cluster functions -_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) -_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) -_ALL_MANAGED_JOB_STATUSES = "|".join( - [status.value for status in ManagedJobStatus]) - -_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( - # A while loop to wait until the cluster status - # becomes certain status, with timeout. 
- 'start_time=$SECONDS; ' - 'while true; do ' - 'if (( $SECONDS - $start_time > {timeout} )); then ' - ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' - 'fi; ' - 'current_status=$(sky status {cluster_name} --refresh | ' - 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + - ')$/) print \$i}}"); ' - 'if [[ "$current_status" =~ {cluster_status} ]]; ' - 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' - 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' - 'sleep 10; ' - 'done') - - -def _get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard: str, cluster_status: str, timeout: int): - wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( - 'sky status {cluster_name}', - 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', - 'awk "/^{cluster_name_awk}/') - return wait_cmd.format(cluster_name=cluster_name_wildcard, - cluster_name_awk=cluster_name_wildcard.replace( - '*', '.*'), - cluster_status=cluster_status, - timeout=timeout) - - -_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( - # A while loop to wait until the cluster is not found or timeout - 'start_time=$SECONDS; ' - 'while true; do ' - 'if (( $SECONDS - $start_time > {timeout} )); then ' - ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' - 'fi; ' - 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' - ' echo "Cluster {cluster_name} successfully removed."; break; ' - 'fi; ' - 'echo "Waiting for cluster {name} to be removed..."; ' - 'sleep 10; ' - 'done') - -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( - # A while loop to wait until the job status - # contains certain status, with timeout. 
- 'start_time=$SECONDS; ' - 'while true; do ' - 'if (( $SECONDS - $start_time > {timeout} )); then ' - ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' - 'fi; ' - 'current_status=$(sky queue {cluster_name} | ' - 'awk "\\$1 == \\"{job_id}\\" ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + - ')$/) print \$i}}"); ' - 'found=0; ' # Initialize found variable outside the loop - 'while read -r line; do ' # Read line by line - ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line - ' echo "Target job status {job_status} reached."; ' - ' found=1; ' - ' break; ' # Break inner loop - ' fi; ' - 'done <<< "$current_status"; ' - 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found - 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' - 'sleep 10; ' - 'done') - -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_id}\\"', 'awk "') - -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') - -# Managed job functions - -_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( - 'sky queue {cluster_name}', 'sky jobs queue').replace( - 'awk "\\$2 == \\"{job_name}\\"', - 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( - _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) - -# After the timeout, the cluster will stop if autostop is set, and our check -# should be more than the timeout. To address this, we extend the timeout by -# _BUMP_UP_SECONDS before exiting. -_BUMP_UP_SECONDS = 35 - -DEFAULT_CMD_TIMEOUT = 15 * 60 - - -class Test(NamedTuple): - name: str - # Each command is executed serially. 
If any failed, the remaining commands - # are not run and the test is treated as failed. - commands: List[str] - teardown: Optional[str] = None - # Timeout for each command in seconds. - timeout: int = DEFAULT_CMD_TIMEOUT - # Environment variables to set for each command. - env: Dict[str, str] = None - - def echo(self, message: str): - # pytest's xdist plugin captures stdout; print to stderr so that the - # logs are streaming while the tests are running. - prefix = f'[{self.name}]' - message = f'{prefix} {message}' - message = message.replace('\n', f'\n{prefix} ') - print(message, file=sys.stderr, flush=True) - - -def _get_timeout(generic_cloud: str, - override_timeout: int = DEFAULT_CMD_TIMEOUT): - timeouts = {'fluidstack': 60 * 60} # file_mounts - return timeouts.get(generic_cloud, override_timeout) - - -def _get_cluster_name() -> str: - """Returns a user-unique cluster name for each test_(). - - Must be called from each test_(). - """ - caller_func_name = inspect.stack()[1][3] - test_name = caller_func_name.replace('_', '-').replace('test-', 't-') - test_name = common_utils.make_cluster_name_on_cloud(test_name, - 24, - add_user_hash=False) - return f'{test_name}-{test_id}' - - -def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: - cluster_name = serve.generate_replica_cluster_name(name, replica_id) - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{cluster_name})" ' - f'--zones={zone} --format="value(name)"') - return (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - - -def run_one_test(test: Test) -> Tuple[int, str, str]: - # Fail fast if `sky` CLI somehow errors out. - subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True) - log_file = tempfile.NamedTemporaryFile('a', - prefix=f'{test.name}-', - suffix='.log', - delete=False) - test.echo(f'Test started. 
Log: less {log_file.name}') - env_dict = os.environ.copy() - if test.env: - env_dict.update(test.env) - for command in test.commands: - log_file.write(f'+ {command}\n') - log_file.flush() - proc = subprocess.Popen( - command, - stdout=log_file, - stderr=subprocess.STDOUT, - shell=True, - executable='/bin/bash', - env=env_dict, - ) - try: - proc.wait(timeout=test.timeout) - except subprocess.TimeoutExpired as e: - log_file.flush() - test.echo(f'Timeout after {test.timeout} seconds.') - test.echo(str(e)) - log_file.write(f'Timeout after {test.timeout} seconds.\n') - log_file.flush() - # Kill the current process. - proc.terminate() - proc.returncode = 1 # None if we don't set it. - break - - if proc.returncode: - break - - style = colorama.Style - fore = colorama.Fore - outcome = (f'{fore.RED}Failed{style.RESET_ALL}' - if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}') - reason = f'\nReason: {command}' if proc.returncode else '' - msg = (f'{outcome}.' - f'{reason}' - f'\nLog: less {log_file.name}\n') - test.echo(msg) - log_file.write(msg) - if (proc.returncode == 0 or - pytest.terminate_on_failure) and test.teardown is not None: - subprocess_utils.run( - test.teardown, - stdout=log_file, - stderr=subprocess.STDOUT, - timeout=10 * 60, # 10 mins - shell=True, - ) - - if proc.returncode: - raise Exception(f'test failed: less {log_file.name}') - - -def get_aws_region_for_quota_failover() -> Optional[str]: - candidate_regions = AWS.regions_with_offering(instance_type='p3.16xlarge', - accelerators=None, - use_spot=True, - region=None, - zone=None) - original_resources = sky.Resources(cloud=sky.AWS(), - instance_type='p3.16xlarge', - use_spot=True) - - # Filter the regions with proxy command in ~/.sky/config.yaml. 
- filtered_regions = original_resources.get_valid_regions_for_launchable() - candidate_regions = [ - region for region in candidate_regions - if region.name in filtered_regions - ] - - for region in candidate_regions: - resources = original_resources.copy(region=region.name) - if not AWS.check_quota_available(resources): - return region.name - - return None - - -def get_gcp_region_for_quota_failover() -> Optional[str]: - - candidate_regions = GCP.regions_with_offering(instance_type=None, - accelerators={'A100-80GB': 1}, - use_spot=True, - region=None, - zone=None) - - original_resources = sky.Resources(cloud=sky.GCP(), - instance_type='a2-ultragpu-1g', - accelerators={'A100-80GB': 1}, - use_spot=True) - - # Filter the regions with proxy command in ~/.sky/config.yaml. - filtered_regions = original_resources.get_valid_regions_for_launchable() - candidate_regions = [ - region for region in candidate_regions - if region.name in filtered_regions - ] - - for region in candidate_regions: - if not GCP.check_quota_available( - original_resources.copy(region=region.name)): - return region.name - - return None - - -# ---------- Dry run: 2 Tasks in a chain. ---------- -@pytest.mark.no_fluidstack #requires GCP and AWS set up -def test_example_app(): - test = Test( - 'example_app', - ['python examples/example_app.py'], - ) - run_one_test(test) - - -_VALIDATE_LAUNCH_OUTPUT = ( - # Validate the output of the job submission: - # ⚙️ Launching on Kubernetes. - # Pod is up. - # ✓ Cluster launched: test. View logs at: ~/sky_logs/sky-2024-10-07-19-44-18-177288/provision.log - # ⚙️ Running setup on 1 pod. - # running setup - # ✓ Setup completed. - # ⚙️ Job submitted, ID: 1. - # ├── Waiting for task resources on 1 node. - # └── Job started. Streaming logs... 
(Ctrl-C to exit log streaming; job will not be killed) - # (min, pid=1277) # conda environments: - # (min, pid=1277) # - # (min, pid=1277) base * /opt/conda - # (min, pid=1277) - # (min, pid=1277) task run finish - # ✓ Job finished (status: SUCCEEDED). - # - # Job ID: 1 - # 📋 Useful Commands - # ├── To cancel the job: sky cancel test 1 - # ├── To stream job logs: sky logs test 1 - # └── To view job queue: sky queue test - # - # Cluster name: test - # ├── To log into the head VM: ssh test - # ├── To submit a job: sky exec test yaml_file - # ├── To stop the cluster: sky stop test - # └── To teardown the cluster: sky down test - 'echo "$s" && echo "==Validating launching==" && ' - 'echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' - 'echo "$s" && echo "==Validating setup output==" && ' - 'echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' - 'echo "==Validating running output hints==" && echo "$s" | ' - 'grep -A 1 "Job submitted, ID:" | ' - 'grep "Waiting for task resources on " && ' - 'echo "==Validating task output starting==" && echo "$s" | ' - 'grep -A 1 "Job started. Streaming logs..." | grep "(min, pid=" && ' - 'echo "==Validating task output ending==" && ' - 'echo "$s" | grep -A 1 "task run finish" | ' - 'grep "Job finished (status: SUCCEEDED)" && ' - 'echo "==Validating task output ending 2==" && ' - 'echo "$s" | grep -A 5 "Job finished (status: SUCCEEDED)" | ' - 'grep "Job ID:" && ' - 'echo "$s" | grep -A 1 "Job ID:" | grep "Useful Commands"') - - -# ---------- A minimal task ---------- -def test_minimal(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'minimal', - [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - # Output validation done. - f'sky logs {name} 1 --status', - f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. 
- # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check the logs downloading - f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log', - # Ensure the raylet process has the correct file descriptor limit. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # Install jq for the next test. - f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', - # Check the cluster info - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', - f'sky logs {name} 5 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', - f'sky logs {name} 6 --status', # Ensure the job succeeded. 
- # Test '-c' for exec - f'sky exec -c {name} echo', - f'sky logs {name} 7 --status', - f'sky exec echo -c {name}', - f'sky logs {name} 8 --status', - f'sky exec -c {name} echo hi test', - f'sky logs {name} 9 | grep "hi test"', - f'sky exec {name} && exit 1 || true', - f'sky exec -c {name} && exit 1 || true', - ], - f'sky down -y {name}', - _get_timeout(generic_cloud), - ) - run_one_test(test) - - -# ---------- Test fast launch ---------- -def test_launch_fast(generic_cloud: str): - name = _get_cluster_name() - - test = Test( - 'test_launch_fast', - [ - # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - - # Second launch to test fast launch - should not reprovision - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast tests/test_yamls/minimal.yaml) && ' - ' echo "$s" && ' - # Validate that cluster was not re-launched. - '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' - # Validate that setup was not re-run. - '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' - # Validate that the task ran and finished. - 'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=_get_timeout(generic_cloud), - ) - run_one_test(test) - - -# See cloud exclusion explanations in test_autostop -@pytest.mark.no_fluidstack -@pytest.mark.no_lambda_cloud -@pytest.mark.no_ibm -@pytest.mark.no_kubernetes -def test_launch_fast_with_autostop(generic_cloud: str): - name = _get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. 
- autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = Test( - 'test_launch_fast_with_autostop', - [ - # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - f'sky status -r {name} | grep UP', - - # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), - - # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=_get_timeout(generic_cloud) + autostop_timeout, - ) - run_one_test(test) - - -# ---------- Test region ---------- -@pytest.mark.aws -def test_aws_region(): - name = _get_cluster_name() - test = Test( - 'aws_region', - [ - f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', - f'sky exec {name} examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_with_ssh_proxy_command(): - name = _get_cluster_name() - - with tempfile.NamedTemporaryFile(mode='w') as f: - f.write( - textwrap.dedent(f"""\ - aws: - ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} - """)) - f.flush() - test = Test( - 'aws_with_ssh_proxy_command', - [ - f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', - # Use jump config - f'export SKYPILOT_CONFIG={f.name}; ' - f'sky launch -y -c {name} --cloud aws --cpus 2 --region us-east-1 echo hi', - f'sky logs {name} 1 --status', - f'export SKYPILOT_CONFIG={f.name}; sky exec {name} echo hi', - f'sky logs {name} 2 --status', - # Start a small job to make sure the controller is created. - f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', - # Wait other tests to create the job controller first, so that - # the job controller is not launched with proxy command. - _get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard='sky-jobs-controller-*', - cluster_status=ClusterStatus.UP.value, - timeout=300), - f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. - format( - job_name=name, - job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', - timeout=300), - ], - f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_region_and_service_account(): - name = _get_cluster_name() - test = Test( - 'gcp_region', - [ - f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_region(): - name = _get_cluster_name() - region = 'eu-de' - test = Test( - 'region', - [ - f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', - f'sky exec {name} --cloud ibm examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep {region}', # Ensure the region is correct. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_region(): - name = _get_cluster_name() - test = Test( - 'azure_region', - [ - f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. 
- # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Test zone ---------- -@pytest.mark.aws -def test_aws_zone(): - name = _get_cluster_name() - test = Test( - 'aws_zone', - [ - f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', - f'sky exec {name} examples/minimal.yaml --zone us-east-2b', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2b', # Ensure the zone is correct. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_zone(): - name = _get_cluster_name() - zone = 'eu-de-2' - test = Test( - 'zone', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky exec {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep {zone}', # Ensure the zone is correct. - ], - f'sky down -y {name} {name}-2 {name}-3', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zone(): - name = _get_cluster_name() - test = Test( - 'gcp_zone', - [ - f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1-a', # Ensure the zone is correct. 
- ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Test the image ---------- -@pytest.mark.aws -def test_aws_images(): - name = _get_cluster_name() - test = Test( - 'aws_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} examples/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_images(): - name = _get_cluster_name() - test = Test( - 'gcp_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:cpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_images(): - name = _get_cluster_name() - test = Test( - 'azure_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky launch -c {name} --image-id skypilot:v1-ubuntu-2004 --cloud azure tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict(): - name = _get_cluster_name() - test = Test( - 'aws_image_id_dict', - [ - # Use image id dict. - f'sky launch -y -c {name} examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict(): - name = _get_cluster_name() - test = Test( - 'gcp_image_id_dict', - [ - # Use image id dict. - f'sky launch -y -c {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_region(): - name = _get_cluster_name() - test = Test( - 'aws_image_id_dict_region', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east-1 examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. 
- f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - # Should success because the image id match for the region. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - # Ensure exec works. - f'sky exec {name} --region us-east-2 examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_region(): - name = _get_cluster_name() - test = Test( - 'gcp_image_id_dict_region', - [ - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east1 tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the region. 
- f'sky launch -c {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-west3', # Ensure the region is correct. - # Ensure exec works. - f'sky exec {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-west3 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_zone(): - name = _get_cluster_name() - test = Test( - 'aws_image_id_dict_zone', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east-1b examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-east-2a examples/per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - # Fail due to image id mismatch. 
- f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2a', # Ensure the zone is correct. - # Ensure exec works. - f'sky exec {name} --zone us-east-2a examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_zone(): - name = _get_cluster_name() - test = Test( - 'gcp_image_id_dict_zone', - [ - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east1-a tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - # Fail due to image id mismatch. - f'sky exec {name} --cloud gcp --image-id skypilot:gpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-central1', # Ensure the zone is correct. - # Ensure exec works. 
- f'sky exec {name} --cloud gcp --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-central1 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_clone_disk_aws(): - name = _get_cluster_name() - test = Test( - 'clone_disk_aws', - [ - f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=60), - # Wait for EC2 instance to be in stopped state. - # TODO: event based wait. - 'sleep 60', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_clone_disk_gcp(): - name = _get_cluster_name() - test = Test( - 'clone_disk_gcp', - [ - f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud gcp --zone us-central1-a "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud gcp --zone us-east1-b "cat 
~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_mig(): - name = _get_cluster_name() - region = 'us-central1' - test = Test( - 'gcp_mig', - [ - f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check MIG exists. - f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', - f'sky autostop -i 0 --down -y {name}', - _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, - timeout=120), - f'gcloud compute instance-templates list | grep "sky-it-{name}"', - # Launch again with the same region. The original instance template - # should be removed. 
- f'sky launch -y -c {name} --gpus L4 --num-nodes 2 --region {region} nvidia-smi', - f'sky logs {name} 1 | grep "L4"', - f'sky down -y {name}', - f'gcloud compute instance-templates list | grep "sky-it-{name}" && exit 1 || true', - ], - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_force_enable_external_ips(): - name = _get_cluster_name() - test_commands = [ - f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', - # Check network of vm is "default" - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.network)" | grep "networks/default"'), - # Check External NAT in network access configs, corresponds to external ip - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.accessConfigs[0].name)" | grep "External NAT"' - ), - f'sky down -y {name}', - ] - skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' - test = Test('gcp_force_enable_external_ips', - test_commands, - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': skypilot_config}) - run_one_test(test) - - -@pytest.mark.aws -def test_image_no_conda(): - name = _get_cluster_name() - test = Test( - 'image_no_conda', - [ - # Use image id dict. 
- f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - f'sky logs {name} 1 --status', - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} examples/per_region_images.yaml', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_custom_default_conda_env(generic_cloud: str): - name = _get_cluster_name() - test = Test('custom_default_conda_env', [ - f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky status -r {name} | grep "UP"', - f'sky logs {name} 1 --status', - f'sky logs {name} 1 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 2 --status', - f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=80), - f'sky start -y {name}', - f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 3 --status', - ], f'sky down -y {name}') - run_one_test(test) - # ------------ Test stale job ------------ @pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py new file mode 100644 index 00000000000..ebd71e9a10e --- /dev/null +++ b/tests/smoke_tests/util.py @@ -0,0 +1,381 @@ +import enum +import inspect +import json +import os +import pathlib +import shlex +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import 
jinja2 +import pytest + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + +# To avoid the second smoke test reusing the cluster launched in the first +# smoke test. Also required for test_managed_jobs_recovery to make sure the +# manual termination with aws ec2 does not accidentally terminate other clusters +# for for the different managed jobs launch with the same job name but a +# different job id. +test_id = str(uuid.uuid4())[-2:] + +LAMBDA_TYPE = '--cloud lambda --gpus A10' +FLUIDSTACK_TYPE = '--cloud fluidstack --gpus RTXA4000' + +SCP_TYPE = '--cloud scp' +SCP_GPU_V100 = '--gpus V100-32GB' + +STORAGE_SETUP_COMMANDS = [ + 'touch ~/tmpfile', 'mkdir -p ~/tmp-workdir', + 'touch ~/tmp-workdir/tmp\ file', 'touch ~/tmp-workdir/tmp\ file2', + 'touch ~/tmp-workdir/foo', + '[ ! -e ~/tmp-workdir/circle-link ] && ln -s ~/tmp-workdir/ ~/tmp-workdir/circle-link || true', + 'touch ~/.ssh/id_rsa.pub' +] + +# Get the job queue, and print it once on its own, then print it again to +# use with grep by the caller. +_GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' +# Wait for a job to be not in RUNNING state. Used to check for RECOVERING. +_JOB_WAIT_NOT_RUNNING = ( + 's=$(sky jobs queue);' + 'until ! 
echo "$s" | grep "{job_name}" | grep "RUNNING"; do ' + 'sleep 10; s=$(sky jobs queue);' + 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') + +# Cluster functions +_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) +_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) +_ALL_MANAGED_JOB_STATUSES = "|".join( + [status.value for status in ManagedJobStatus]) + +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( + # A while loop to wait until the cluster status + # becomes certain status, with timeout. + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky status {cluster_name} --refresh | ' + 'awk "/^{cluster_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + + ')$/) print \$i}}"); ' + 'if [[ "$current_status" =~ {cluster_status} ]]; ' + 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' + 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' + 'sleep 10; ' + 'done') + + +def _get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard: str, cluster_status: str, timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + 'sky status {cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + return wait_cmd.format(cluster_name=cluster_name_wildcard, + cluster_name_awk=cluster_name_wildcard.replace( + '*', '.*'), + cluster_status=cluster_status, + timeout=timeout) + + +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( + # A while loop to wait until the cluster is not found or timeout + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' + 'fi; ' + 'if 
sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' + ' echo "Cluster {cluster_name} successfully removed."; break; ' + 'fi; ' + 'echo "Waiting for cluster {cluster_name} to be removed..."; ' + 'sleep 10; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( + # A while loop to wait until the job status + # contains certain status, with timeout. + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky queue {cluster_name} | ' + 'awk "\\$1 == \\"{job_id}\\" ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + + ')$/) print \$i}}"); ' + 'found=0; ' # Initialize found variable outside the loop + 'while read -r line; do ' # Read line by line + ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line + ' echo "Target job status {job_status} reached."; ' + ' found=1; ' + ' break; ' # Break inner loop + ' fi; ' + 'done <<< "$current_status"; ' + 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found + 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' + 'sleep 10; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_id}\\"', 'awk "') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') + +# Managed job functions + +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( + 'sky queue {cluster_name}', 'sky jobs queue').replace( + 'awk "\\$2 == \\"{job_name}\\"', + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( + _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) + +# After the 
timeout, the cluster will stop if autostop is set, and our check +# should be more than the timeout. To address this, we extend the timeout by +# _BUMP_UP_SECONDS before exiting. +_BUMP_UP_SECONDS = 35 + +DEFAULT_CMD_TIMEOUT = 15 * 60 + + +class Test(NamedTuple): + name: str + # Each command is executed serially. If any failed, the remaining commands + # are not run and the test is treated as failed. + commands: List[str] + teardown: Optional[str] = None + # Timeout for each command in seconds. + timeout: int = DEFAULT_CMD_TIMEOUT + # Environment variables to set for each command. + env: Dict[str, str] = None + + def echo(self, message: str): + # pytest's xdist plugin captures stdout; print to stderr so that the + # logs are streaming while the tests are running. + prefix = f'[{self.name}]' + message = f'{prefix} {message}' + message = message.replace('\n', f'\n{prefix} ') + print(message, file=sys.stderr, flush=True) + + +def _get_timeout(generic_cloud: str, + override_timeout: int = DEFAULT_CMD_TIMEOUT): + timeouts = {'fluidstack': 60 * 60} # file_mounts + return timeouts.get(generic_cloud, override_timeout) + + +def _get_cluster_name() -> str: + """Returns a user-unique cluster name for each test_(). + + Must be called from each test_(). 
+ """ + caller_func_name = inspect.stack()[1][3] + test_name = caller_func_name.replace('_', '-').replace('test-', 't-') + test_name = common_utils.make_cluster_name_on_cloud(test_name, + 24, + add_user_hash=False) + return f'{test_name}-{test_id}' + + +def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: + cluster_name = serve.generate_replica_cluster_name(name, replica_id) + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{cluster_name})" ' + f'--zones={zone} --format="value(name)"') + return (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + + +def run_one_test(test: Test) -> Tuple[int, str, str]: + # Fail fast if `sky` CLI somehow errors out. + subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True) + log_file = tempfile.NamedTemporaryFile('a', + prefix=f'{test.name}-', + suffix='.log', + delete=False) + test.echo(f'Test started. Log: less {log_file.name}') + env_dict = os.environ.copy() + if test.env: + env_dict.update(test.env) + for command in test.commands: + log_file.write(f'+ {command}\n') + log_file.flush() + proc = subprocess.Popen( + command, + stdout=log_file, + stderr=subprocess.STDOUT, + shell=True, + executable='/bin/bash', + env=env_dict, + ) + try: + proc.wait(timeout=test.timeout) + except subprocess.TimeoutExpired as e: + log_file.flush() + test.echo(f'Timeout after {test.timeout} seconds.') + test.echo(str(e)) + log_file.write(f'Timeout after {test.timeout} seconds.\n') + log_file.flush() + # Kill the current process. + proc.terminate() + proc.returncode = 1 # None if we don't set it. + break + + if proc.returncode: + break + + style = colorama.Style + fore = colorama.Fore + outcome = (f'{fore.RED}Failed{style.RESET_ALL}' + if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}') + reason = f'\nReason: {command}' if proc.returncode else '' + msg = (f'{outcome}.' 
+ f'{reason}' + f'\nLog: less {log_file.name}\n') + test.echo(msg) + log_file.write(msg) + if (proc.returncode == 0 or + pytest.terminate_on_failure) and test.teardown is not None: + subprocess_utils.run( + test.teardown, + stdout=log_file, + stderr=subprocess.STDOUT, + timeout=10 * 60, # 10 mins + shell=True, + ) + + if proc.returncode: + raise Exception(f'test failed: less {log_file.name}') + + +def get_aws_region_for_quota_failover() -> Optional[str]: + candidate_regions = AWS.regions_with_offering(instance_type='p3.16xlarge', + accelerators=None, + use_spot=True, + region=None, + zone=None) + original_resources = sky.Resources(cloud=sky.AWS(), + instance_type='p3.16xlarge', + use_spot=True) + + # Filter the regions with proxy command in ~/.sky/config.yaml. + filtered_regions = original_resources.get_valid_regions_for_launchable() + candidate_regions = [ + region for region in candidate_regions + if region.name in filtered_regions + ] + + for region in candidate_regions: + resources = original_resources.copy(region=region.name) + if not AWS.check_quota_available(resources): + return region.name + + return None + + +def get_gcp_region_for_quota_failover() -> Optional[str]: + + candidate_regions = GCP.regions_with_offering(instance_type=None, + accelerators={'A100-80GB': 1}, + use_spot=True, + region=None, + zone=None) + + original_resources = sky.Resources(cloud=sky.GCP(), + instance_type='a2-ultragpu-1g', + accelerators={'A100-80GB': 1}, + use_spot=True) + + # Filter the regions with proxy command in ~/.sky/config.yaml. + filtered_regions = original_resources.get_valid_regions_for_launchable() + candidate_regions = [ + region for region in candidate_regions + if region.name in filtered_regions + ] + + for region in candidate_regions: + if not GCP.check_quota_available( + original_resources.copy(region=region.name)): + return region.name + + return None + + +_VALIDATE_LAUNCH_OUTPUT = ( + # Validate the output of the job submission: + # ⚙️ Launching on Kubernetes. 
+ # Pod is up. + # ✓ Cluster launched: test. View logs at: ~/sky_logs/sky-2024-10-07-19-44-18-177288/provision.log + # ⚙️ Running setup on 1 pod. + # running setup + # ✓ Setup completed. + # ⚙️ Job submitted, ID: 1. + # ├── Waiting for task resources on 1 node. + # └── Job started. Streaming logs... (Ctrl-C to exit log streaming; job will not be killed) + # (min, pid=1277) # conda environments: + # (min, pid=1277) # + # (min, pid=1277) base * /opt/conda + # (min, pid=1277) + # (min, pid=1277) task run finish + # ✓ Job finished (status: SUCCEEDED). + # + # Job ID: 1 + # 📋 Useful Commands + # ├── To cancel the job: sky cancel test 1 + # ├── To stream job logs: sky logs test 1 + # └── To view job queue: sky queue test + # + # Cluster name: test + # ├── To log into the head VM: ssh test + # ├── To submit a job: sky exec test yaml_file + # ├── To stop the cluster: sky stop test + # └── To teardown the cluster: sky down test + 'echo "$s" && echo "==Validating launching==" && ' + 'echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' + 'echo "$s" && echo "==Validating setup output==" && ' + 'echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' + 'echo "==Validating running output hints==" && echo "$s" | ' + 'grep -A 1 "Job submitted, ID:" | ' + 'grep "Waiting for task resources on " && ' + 'echo "==Validating task output starting==" && echo "$s" | ' + 'grep -A 1 "Job started. Streaming logs..." 
| grep "(min, pid=" && ' + 'echo "==Validating task output ending==" && ' + 'echo "$s" | grep -A 1 "task run finish" | ' + 'grep "Job finished (status: SUCCEEDED)" && ' + 'echo "==Validating task output ending 2==" && ' + 'echo "$s" | grep -A 5 "Job finished (status: SUCCEEDED)" | ' + 'grep "Job ID:" && ' + 'echo "$s" | grep -A 1 "Job ID:" | grep "Useful Commands"') From 9abd4d43e0c2d13c99f57ea4615087d064629cfe Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 17:01:04 +0800 Subject: [PATCH 20/64] fix import --- tests/smoke_tests/test_basic.py | 54 ---------------------- tests/smoke_tests/test_images.py | 55 ----------------------- tests/smoke_tests/test_region_and_zone.py | 33 -------------- tests/smoke_tests/test_smoke.py | 14 +++--- 4 files changed, 6 insertions(+), 150 deletions(-) diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index 9d8a1225e42..c0996e135d0 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -25,69 +25,15 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -import enum -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys -import tempfile -import textwrap -import time -from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse -import uuid - -import colorama -import jinja2 import pytest from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import _GET_JOB_QUEUE from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( 
- _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE -from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils # ---------- Dry run: 2 Tasks in a chain. 
---------- diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index 42438461f76..96ce2f59c0c 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -25,69 +25,14 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -import enum -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys -import tempfile -import textwrap -import time -from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse -import uuid - -import colorama -import jinja2 import pytest from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import _GET_JOB_QUEUE -from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE -from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from 
sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils # ---------- Test the image ---------- diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index 57f84ff4a0e..0fc7ce409fc 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -47,47 +47,14 @@ from smoke_tests.util import _get_cluster_name from smoke_tests.util import ( _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import _GET_JOB_QUEUE -from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) from smoke_tests.util import ( _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE -from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP 
-from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone from sky.jobs.state import ManagedJobStatus from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils # ---------- Test region ---------- diff --git a/tests/smoke_tests/test_smoke.py b/tests/smoke_tests/test_smoke.py index 03132743c0e..348c880d7a7 100644 --- a/tests/smoke_tests/test_smoke.py +++ b/tests/smoke_tests/test_smoke.py @@ -25,7 +25,6 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -import enum import inspect import json import os @@ -37,34 +36,33 @@ import tempfile import textwrap import time -from typing import Dict, List, NamedTuple, Optional, Tuple +from typing import Dict, List, Optional, Tuple import urllib.parse import uuid -import colorama import jinja2 import pytest +from smoke_tests.util import _BUMP_UP_SECONDS from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) from smoke_tests.util import _GET_JOB_QUEUE from smoke_tests.util import _get_timeout from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _terminate_gcp_replica from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID from smoke_tests.util import ( _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) from smoke_tests.util import ( _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import get_aws_region_for_quota_failover +from smoke_tests.util 
import get_gcp_region_for_quota_failover from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test from smoke_tests.util import SCP_GPU_V100 from smoke_tests.util import SCP_TYPE from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test +from smoke_tests.util import test_id import sky from sky import global_user_state From e0a4c9fdf98a711be66f7921778e74f41af127fe Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 17:56:44 +0800 Subject: [PATCH 21/64] buildkite config --- .buildkite/pipeline.yaml | 5 ++++ tests/smoke_tests/util.py | 52 ++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 17 deletions(-) create mode 100644 .buildkite/pipeline.yaml diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml new file mode 100644 index 00000000000..21efc41de1d --- /dev/null +++ b/.buildkite/pipeline.yaml @@ -0,0 +1,5 @@ +steps: + - label: "smoke test -> test_minimal" + command: "pytest tests/smoke_tests/test_basic.py::test_minimal" + env: + LOG_TO_STDOUT: "1" diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index ebd71e9a10e..c413bc6f2be 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -222,20 +222,31 @@ def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: def run_one_test(test: Test) -> Tuple[int, str, str]: # Fail fast if `sky` CLI somehow errors out. subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True) - log_file = tempfile.NamedTemporaryFile('a', - prefix=f'{test.name}-', - suffix='.log', - delete=False) - test.echo(f'Test started. Log: less {log_file.name}') + log_to_stdout = os.environ.get('LOG_TO_STDOUT', None) + if log_to_stdout: + write = test.echo + flush = lambda: None + out = sys.stdout + test.echo(f'Test started. 
Log to stdout') + else: + log_file = tempfile.NamedTemporaryFile('a', + prefix=f'{test.name}-', + suffix='.log', + delete=False) + write = log_file.write + flush = log_file.flush + out = log_file + test.echo(f'Test started. Log: less {log_file.name}') + env_dict = os.environ.copy() if test.env: env_dict.update(test.env) for command in test.commands: - log_file.write(f'+ {command}\n') - log_file.flush() + write(f'+ {command}\n') + flush() proc = subprocess.Popen( command, - stdout=log_file, + stdout=out, stderr=subprocess.STDOUT, shell=True, executable='/bin/bash', @@ -244,11 +255,11 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: try: proc.wait(timeout=test.timeout) except subprocess.TimeoutExpired as e: - log_file.flush() + flush() test.echo(f'Timeout after {test.timeout} seconds.') test.echo(str(e)) - log_file.write(f'Timeout after {test.timeout} seconds.\n') - log_file.flush() + write(f'Timeout after {test.timeout} seconds.\n') + flush() # Kill the current process. proc.terminate() proc.returncode = 1 # None if we don't set it. @@ -263,22 +274,29 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}') reason = f'\nReason: {command}' if proc.returncode else '' msg = (f'{outcome}.' 
- f'{reason}' - f'\nLog: less {log_file.name}\n') - test.echo(msg) - log_file.write(msg) + f'{reason}') + if log_to_stdout: + test.echo(msg) + else: + msg += f'\nLog: less {log_file.name}\n' + test.echo(msg) + write(msg) + if (proc.returncode == 0 or pytest.terminate_on_failure) and test.teardown is not None: subprocess_utils.run( test.teardown, - stdout=log_file, + stdout=out, stderr=subprocess.STDOUT, timeout=10 * 60, # 10 mins shell=True, ) if proc.returncode: - raise Exception(f'test failed: less {log_file.name}') + if log_to_stdout: + raise Exception(f'test failed') + else: + raise Exception(f'test failed: less {log_file.name}') def get_aws_region_for_quota_failover() -> Optional[str]: From 58090a340584ee810ded86b7db6a1c2ab22e873f Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 18:09:08 +0800 Subject: [PATCH 22/64] fix stdout problem --- tests/smoke_tests/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index c413bc6f2be..322c19a266e 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -226,7 +226,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: if log_to_stdout: write = test.echo flush = lambda: None - out = sys.stdout + subprocess_out = sys.stderr test.echo(f'Test started. Log to stdout') else: log_file = tempfile.NamedTemporaryFile('a', @@ -235,7 +235,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: delete=False) write = log_file.write flush = log_file.flush - out = log_file + subprocess_out = log_file test.echo(f'Test started. 
Log: less {log_file.name}') env_dict = os.environ.copy() @@ -246,7 +246,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: flush() proc = subprocess.Popen( command, - stdout=out, + stdout=subprocess_out, stderr=subprocess.STDOUT, shell=True, executable='/bin/bash', @@ -286,7 +286,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: pytest.terminate_on_failure) and test.teardown is not None: subprocess_utils.run( test.teardown, - stdout=out, + stdout=subprocess_out, stderr=subprocess.STDOUT, timeout=10 * 60, # 10 mins shell=True, From 88b396f11793676204f679d19164cadc9b105fac Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 11:02:29 +0800 Subject: [PATCH 23/64] update pipeline test --- .buildkite/pipeline.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 21efc41de1d..679b463580e 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -1,5 +1,10 @@ steps: - - label: "smoke test -> test_minimal" + - label: "test_minimal" command: "pytest tests/smoke_tests/test_basic.py::test_minimal" env: LOG_TO_STDOUT: "1" + + - label: "test_aws_stale_job_manual_restart" + command: "pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart" + env: + LOG_TO_STDOUT: "1" From 9405b449fc2d13e45d20eacd3a97d7c5eca408b7 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 11:31:01 +0800 Subject: [PATCH 24/64] test again --- .buildkite/pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 679b463580e..4d8ed6ff8f0 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -4,7 +4,7 @@ steps: env: LOG_TO_STDOUT: "1" - - label: "test_aws_stale_job_manual_restart" - command: "pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart" + - label: "test_launch_fast" + command: "pytest 
tests/smoke_tests/test_basic.py::test_launch_fast" env: LOG_TO_STDOUT: "1" From 5a2409f2e2cdc4f588d5aa3a482f65a936e05c50 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 15:34:26 +0800 Subject: [PATCH 25/64] smoke test for buildkite --- .buildkite/generate_pipeline.py | 107 + .buildkite/pipeline.yaml | 10 - .buildkite/pipeline_smoke_test_basic.yaml | 91 + .../pipeline_smoke_test_cluster_job.yaml | 265 + .buildkite/pipeline_smoke_test_images.yaml | 66 + .../pipeline_smoke_test_managed_job.yaml | 79 + ...pipeline_smoke_test_mount_and_storage.yaml | 139 + .../pipeline_smoke_test_region_and_zone.yaml | 36 + ...line_smoke_test_required_before_merge.yaml | 7 + .buildkite/pipeline_smoke_test_sky_serve.yaml | 106 + tests/smoke_tests/test_basic.py | 511 +- tests/smoke_tests/test_cluster_job.py | 1657 ++++++ tests/smoke_tests/test_images.py | 50 +- tests/smoke_tests/test_managed_job.py | 766 +++ tests/smoke_tests/test_mount_and_storage.py | 1503 +++++ tests/smoke_tests/test_region_and_zone.py | 65 +- .../smoke_tests/test_required_before_merge.py | 46 + tests/smoke_tests/test_sky_serve.py | 795 +++ tests/smoke_tests/test_smoke.py | 5077 ----------------- tests/smoke_tests/util.py | 54 +- tests/test_smoke.py | 36 + .../minimal_test_required_before_merge.yaml | 13 + 22 files changed, 6254 insertions(+), 5225 deletions(-) create mode 100644 .buildkite/generate_pipeline.py delete mode 100644 .buildkite/pipeline.yaml create mode 100644 .buildkite/pipeline_smoke_test_basic.yaml create mode 100644 .buildkite/pipeline_smoke_test_cluster_job.yaml create mode 100644 .buildkite/pipeline_smoke_test_images.yaml create mode 100644 .buildkite/pipeline_smoke_test_managed_job.yaml create mode 100644 .buildkite/pipeline_smoke_test_mount_and_storage.yaml create mode 100644 .buildkite/pipeline_smoke_test_region_and_zone.yaml create mode 100644 .buildkite/pipeline_smoke_test_required_before_merge.yaml create mode 100644 .buildkite/pipeline_smoke_test_sky_serve.yaml create mode 
100644 tests/smoke_tests/test_cluster_job.py create mode 100644 tests/smoke_tests/test_managed_job.py create mode 100644 tests/smoke_tests/test_mount_and_storage.py create mode 100644 tests/smoke_tests/test_required_before_merge.py create mode 100644 tests/smoke_tests/test_sky_serve.py delete mode 100644 tests/smoke_tests/test_smoke.py create mode 100644 tests/test_smoke.py create mode 100644 tests/test_yamls/minimal_test_required_before_merge.yaml diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py new file mode 100644 index 00000000000..b363c695057 --- /dev/null +++ b/.buildkite/generate_pipeline.py @@ -0,0 +1,107 @@ +"""This script generates a Buildkite pipeline from test files.""" +import ast +import copy +import os +from typing import Any, Dict, List + +import yaml + +DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] + + +def _get_full_decorator_path(decorator: ast.AST) -> str: + """Recursively get the full path of a decorator.""" + if isinstance(decorator, ast.Attribute): + return f'{_get_full_decorator_path(decorator.value)}.{decorator.attr}' + elif isinstance(decorator, ast.Name): + return decorator.id + elif isinstance(decorator, ast.Call): + return _get_full_decorator_path(decorator.func) + raise ValueError(f'Unknown decorator type: {type(decorator)}') + + +def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: + """Extract test functions and filter clouds with pytest.mark + from a Python test file.""" + with open(file_path, 'r', encoding='utf-8') as file: + tree = ast.parse(file.read(), filename=file_path) + + for node in ast.walk(tree): + for child in ast.iter_child_nodes(node): + setattr(child, 'parent', node) + + function_cloud_map = {} + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'): + class_name = None + if hasattr(node, 'parent') and isinstance(node.parent, + ast.ClassDef): + class_name = node.parent.name + + clouds_to_include = [] + clouds_to_exclude = [] + for 
decorator in node.decorator_list: + if isinstance(decorator, ast.Call): + # We only need to consider the decorator with no arguments + # to extract clouds. + continue + full_path = _get_full_decorator_path(decorator) + if full_path.startswith('pytest.mark.'): + assert isinstance(decorator, ast.Attribute) + suffix = decorator.attr + if suffix.startswith('no_'): + clouds_to_exclude.append(suffix[3:]) + else: + clouds_to_include.append(suffix) + clouds_to_include = (clouds_to_include if clouds_to_include else + copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) + clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud not in clouds_to_exclude + ] + function_name = (f'{class_name}::{node.name}' + if class_name else node.name) + function_cloud_map[function_name] = (clouds_to_include) + return function_cloud_map + + +def _generate_pipeline(test_file: str) -> Dict[str, Any]: + """Generate a Buildkite pipeline from test files.""" + steps = [] + function_cloud_map = _extract_marked_tests(test_file) + for test_function, clouds in function_cloud_map.items(): + for cloud in clouds: + step = { + 'label': f'{test_function} on {cloud}', + 'command': f'pytest {test_file}::{test_function} --{cloud}', + 'env': { + 'LOG_TO_STDOUT': '1' + } + } + steps.append(step) + # we only run one cloud per test function for now + break + return {'steps': steps} + + +def main(): + # List of test files to include in the pipeline + test_files = os.listdir('tests/smoke_tests') + + for test_file in test_files: + if not test_file.startswith('test_'): + continue + test_file_path = os.path.join('tests/smoke_tests', test_file) + pipeline = _generate_pipeline(test_file_path) + yaml_file_path = '.buildkite/pipeline_smoke_' + \ + f'{test_file.split(".")[0]}.yaml' + with open(yaml_file_path, 'w', encoding='utf-8') as file: + file.write('# This is an auto-generated Buildkite pipeline by ' + '.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n') + yaml.dump(pipeline, file, 
default_flow_style=False) + print(f'Convert {test_file_path} to {yaml_file_path}') + + +if __name__ == '__main__': + main() diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml deleted file mode 100644 index 4d8ed6ff8f0..00000000000 --- a/.buildkite/pipeline.yaml +++ /dev/null @@ -1,10 +0,0 @@ -steps: - - label: "test_minimal" - command: "pytest tests/smoke_tests/test_basic.py::test_minimal" - env: - LOG_TO_STDOUT: "1" - - - label: "test_launch_fast" - command: "pytest tests/smoke_tests/test_basic.py::test_launch_fast" - env: - LOG_TO_STDOUT: "1" diff --git a/.buildkite/pipeline_smoke_test_basic.yaml b/.buildkite/pipeline_smoke_test_basic.yaml new file mode 100644 index 00000000000..9c775c1f5fb --- /dev/null +++ b/.buildkite/pipeline_smoke_test_basic.yaml @@ -0,0 +1,91 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + env: + LOG_TO_STDOUT: '1' + label: test_example_app on aws +- command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + env: + LOG_TO_STDOUT: '1' + label: test_minimal on aws +- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + env: + LOG_TO_STDOUT: '1' + label: test_launch_fast on aws +- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + env: + LOG_TO_STDOUT: '1' + label: test_launch_fast_with_autostop on aws +- command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + env: + LOG_TO_STDOUT: '1' + label: test_stale_job on aws +- command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_stale_job_manual_restart on aws +- command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_stale_job_manual_restart on gcp +- command: pytest 
tests/smoke_tests/test_basic.py::test_env_check --aws + env: + LOG_TO_STDOUT: '1' + label: test_env_check on aws +- command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + env: + LOG_TO_STDOUT: '1' + label: test_cli_logs on aws +- command: pytest tests/smoke_tests/test_basic.py::test_scp_logs --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_logs on scp +- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp + env: + LOG_TO_STDOUT: '1' + label: test_core_api_sky_launch_exec on gcp +- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + env: + LOG_TO_STDOUT: '1' + label: test_core_api_sky_launch_fast on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_ordered on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_ordered_with_default on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_unordered on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_unordered_with_default on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_resources on aws +- command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + env: + LOG_TO_STDOUT: '1' + label: test_sky_bench on aws +- command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_context_failover on kubernetes +- command: pytest 
tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + env: + LOG_TO_STDOUT: '1' + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws diff --git a/.buildkite/pipeline_smoke_test_cluster_job.yaml b/.buildkite/pipeline_smoke_test_cluster_job.yaml new file mode 100644 index 00000000000..3b81274a00a --- /dev/null +++ b/.buildkite/pipeline_smoke_test_cluster_job.yaml @@ -0,0 +1,265 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + env: + LOG_TO_STDOUT: '1' + label: test_job_queue on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws + env: + LOG_TO_STDOUT: '1' + label: test_job_queue_with_docker on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --lambda_cloud + env: + LOG_TO_STDOUT: '1' + label: test_lambda_job_queue on lambda_cloud +- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_job_queue on ibm +- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_job_queue --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_job_queue on scp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + env: + LOG_TO_STDOUT: '1' + label: test_job_queue_multinode on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + env: + LOG_TO_STDOUT: '1' + label: test_large_job_queue on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws + env: + LOG_TO_STDOUT: '1' + label: test_fast_large_job_queue on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue_multinode + --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_job_queue_multinode on ibm +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + --aws + env: + LOG_TO_STDOUT: '1' + label: test_docker_preinstalled_package on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + env: + LOG_TO_STDOUT: '1' + label: test_multi_echo on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + env: + LOG_TO_STDOUT: '1' + label: test_huggingface on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --lambda_cloud + env: + LOG_TO_STDOUT: '1' + label: test_lambda_huggingface on lambda_cloud +- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_huggingface --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_huggingface on scp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + env: + LOG_TO_STDOUT: '1' + label: test_inferentia on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + env: + LOG_TO_STDOUT: '1' + label: test_tpu on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + env: + LOG_TO_STDOUT: '1' + label: test_tpu_vm on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + env: + LOG_TO_STDOUT: '1' + label: test_tpu_vm_pod on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_tpu_pod_slice_gke on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + env: + LOG_TO_STDOUT: '1' + label: test_multi_hostname on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + env: + LOG_TO_STDOUT: '1' + label: test_multi_node_failure on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_http_server_with_custom_ports on gcp +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_http_server_with_custom_ports on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_http_server_with_custom_ports on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_http_server_with_custom_ports on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_paperspace_http_server_with_custom_ports + --paperspace + env: + LOG_TO_STDOUT: '1' + label: test_paperspace_http_server_with_custom_ports on paperspace +- command: pytest tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports + --runpod + env: + LOG_TO_STDOUT: '1' + label: test_runpod_http_server_with_custom_ports on runpod +- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + env: + LOG_TO_STDOUT: '1' + label: test_task_labels_aws on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + env: + LOG_TO_STDOUT: '1' + label: test_task_labels_gcp on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_task_labels_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_container_logs_multinode_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_container_logs_two_jobs_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + env: + LOG_TO_STDOUT: '1' + label: test_distributed_tf on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_start_stop on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_start_stop on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + env: + LOG_TO_STDOUT: '1' + label: test_autostop on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + env: + LOG_TO_STDOUT: '1' + label: test_autodown on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_autodown --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_autodown on scp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + env: + LOG_TO_STDOUT: '1' + label: test_cancel_aws on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + env: + LOG_TO_STDOUT: '1' + label: test_cancel_gcp on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + env: + LOG_TO_STDOUT: '1' + label: test_cancel_azure on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch 
--aws + env: + LOG_TO_STDOUT: '1' + label: test_cancel_pytorch on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_ibm --ibm + env: + LOG_TO_STDOUT: '1' + label: test_cancel_ibm on ibm +- command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + env: + LOG_TO_STDOUT: '1' + label: test_use_spot on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + env: + LOG_TO_STDOUT: '1' + label: test_stop_gcp_spot on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + env: + LOG_TO_STDOUT: '1' + label: test_inline_env on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + env: + LOG_TO_STDOUT: '1' + label: test_inline_env_file on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_custom_image on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_custom_image on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_start_stop_two_nodes on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_disk_tier on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_disk_tier on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_disk_tier on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_best_tier_failover on azure +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_zero_quota_failover on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_zero_quota_failover on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + env: + LOG_TO_STDOUT: '1' + label: test_long_setup_run_script on aws diff --git a/.buildkite/pipeline_smoke_test_images.yaml b/.buildkite/pipeline_smoke_test_images.yaml new file mode 100644 index 00000000000..4991fccbbc7 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_images.yaml @@ -0,0 +1,66 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_images on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_images on gcp +- command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_images on azure +- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_image_id_dict on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_image_id_dict on gcp +- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_image_id_dict_region on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_image_id_dict_region on gcp +- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + env: + LOG_TO_STDOUT: '1' + label: 
test_aws_image_id_dict_zone on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_image_id_dict_zone on gcp +- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + env: + LOG_TO_STDOUT: '1' + label: test_clone_disk_aws on aws +- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + env: + LOG_TO_STDOUT: '1' + label: test_clone_disk_gcp on gcp +- command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_mig on gcp +- command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_force_enable_external_ips on gcp +- command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + env: + LOG_TO_STDOUT: '1' + label: test_image_no_conda on aws +- command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + env: + LOG_TO_STDOUT: '1' + label: test_custom_default_conda_env on aws diff --git a/.buildkite/pipeline_smoke_test_managed_job.yaml b/.buildkite/pipeline_smoke_test_managed_job.yaml new file mode 100644 index 00000000000..cda2b87a53c --- /dev/null +++ b/.buildkite/pipeline_smoke_test_managed_job.yaml @@ -0,0 +1,79 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_job_pipeline on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_failed_setup on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_pipeline_failed_setup on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_pipeline_recovery_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_pipeline_recovery_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_default_resources on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_multi_node_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + env: + LOG_TO_STDOUT: 
'1' + label: test_managed_jobs_recovery_multi_node_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_cancellation_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_cancellation_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_storage on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_tpu on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_inline_env on managed_jobs diff --git a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml new file mode 100644 index 00000000000..6f1d11e7804 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml @@ -0,0 +1,139 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + env: + LOG_TO_STDOUT: '1' + label: test_file_mounts on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_scp_file_mounts + --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_file_mounts on scp +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + env: + LOG_TO_STDOUT: '1' + label: test_using_file_mounts_with_env_vars on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_storage_mounts_with_stop on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_storage_mounts_with_stop on gcp +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_storage_mounts_with_stop on azure +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_storage_mounts on kubernetes +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_context_switch on kubernetes +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + env: + LOG_TO_STDOUT: '1' + label: test_docker_storage_mounts on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_cloudflare_storage_mounts + --cloudflare + env: + LOG_TO_STDOUT: '1' + label: test_cloudflare_storage_mounts on cloudflare +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_ibm_storage_mounts + --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_storage_mounts on ibm +- command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_bucket_external_deletion on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_public_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_nonexistent_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_private_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: 
TestStorageWithCredentials::test_upload_to_existing_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_list_source on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_invalid_names on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_aws_regions on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_gcs_regions on aws diff --git a/.buildkite/pipeline_smoke_test_region_and_zone.yaml b/.buildkite/pipeline_smoke_test_region_and_zone.yaml new file mode 100644 index 00000000000..ae38eb4b594 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_region_and_zone.yaml @@ -0,0 +1,36 @@ +# This is an auto-generated Buildkite pipeline by 
.buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_region on aws +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_with_ssh_proxy_command on aws +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_region_and_service_account on gcp +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_region --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_region on ibm +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_region on azure +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_zone on aws +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_zone --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_zone on ibm +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_zone on gcp diff --git a/.buildkite/pipeline_smoke_test_required_before_merge.yaml b/.buildkite/pipeline_smoke_test_required_before_merge.yaml new file mode 100644 index 00000000000..8a29f838e4e --- /dev/null +++ b/.buildkite/pipeline_smoke_test_required_before_merge.yaml @@ -0,0 +1,7 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + env: + LOG_TO_STDOUT: '1' + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_test_sky_serve.yaml b/.buildkite/pipeline_smoke_test_sky_serve.yaml new file mode 100644 index 00000000000..0fd84641780 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_sky_serve.yaml @@ -0,0 +1,106 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_gcp_http on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_aws_http on aws +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_azure_http on azure +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_kubernetes_http on kubernetes +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_oci_http --oci + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_oci_http on oci +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_llm on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_spot_recovery on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_base_ondemand_fallback on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + env: + LOG_TO_STDOUT: '1' + label: 
test_skyserve_dynamic_ondemand_fallback on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_user_bug_restart on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_load_balancer on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_auto_restart on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_cancel on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_streaming on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_readiness_timeout_fail on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_large_readiness_timeout on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_rolling_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_fast_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_update_autoscale on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --serve + 
env: + LOG_TO_STDOUT: '1' + label: test_skyserve_new_autoscaler_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_failures on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + env: + LOG_TO_STDOUT: '1' + label: test_user_dependencies on aws diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index c0996e135d0..0090ae957b8 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -1,39 +1,45 @@ -# Smoke tests for SkyPilot +# Smoke tests for SkyPilot for basic functionality # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py +# > pytest tests/smoke_tests/test_basic.py # # Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure +# > pytest tests/smoke_tests/test_basic.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve +# > pytest tests/smoke_tests/test_basic.py::test_minimal # # Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws +# > pytest tests/smoke_tests/test_basic.py --aws # # Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws +# > pytest tests/smoke_tests/test_basic.py --generic-cloud aws + +import pathlib +import subprocess +import tempfile +import textwrap +import time import pytest -from smoke_tests.util import _get_cluster_name -from smoke_tests.util import _get_timeout -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import 
get_cluster_name +from smoke_tests.util import get_timeout from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_TYPE from smoke_tests.util import Test +from smoke_tests.util import VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB +import sky +from sky.skylet import events +from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus +from sky.utils import common_utils # ---------- Dry run: 2 Tasks in a chain. ---------- @@ -48,16 +54,16 @@ def test_example_app(): # ---------- A minimal task ---------- def test_minimal(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'minimal', [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', # Output validation done. f'sky logs {name} 1 --status', f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. 
# Check the logs downloading @@ -83,20 +89,20 @@ def test_minimal(generic_cloud: str): f'sky exec -c {name} && exit 1 || true', ], f'sky down -y {name}', - _get_timeout(generic_cloud), + get_timeout(generic_cloud), ) run_one_test(test) # ---------- Test fast launch ---------- def test_launch_fast(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'test_launch_fast', [ # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', # Second launch to test fast launch - should not reprovision @@ -112,7 +118,7 @@ def test_launch_fast(generic_cloud: str): f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=_get_timeout(generic_cloud), + timeout=get_timeout(generic_cloud), ) run_one_test(test) @@ -123,7 +129,7 @@ def test_launch_fast(generic_cloud: str): @pytest.mark.no_ibm @pytest.mark.no_kubernetes def test_launch_fast_with_autostop(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. 
autostop_timeout = 600 if generic_cloud == 'azure' else 250 @@ -131,22 +137,471 @@ def test_launch_fast_with_autostop(generic_cloud: str): 'test_launch_fast_with_autostop', [ # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=_get_timeout(generic_cloud) + autostop_timeout, + timeout=get_timeout(generic_cloud) + autostop_timeout, + ) + run_one_test(test) + + +# ------------ Test stale job ------------ +@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances +@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances +def test_stale_job(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'stale_job', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', + f'sky exec {name} -d "echo start; sleep 10000"', + f'sky stop {name} -y', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + 
cluster_status=ClusterStatus.STOPPED.value, + timeout=100), + f'sky start {name} -y', + f'sky logs {name} 1 --status', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED_DRIVER', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_stale_job_manual_restart(): + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.AWS.max_cluster_name_length()) + region = 'us-east-2' + test = Test( + 'aws_stale_job_manual_restart', + [ + f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', + f'sky exec {name} -d "echo start; sleep 10000"', + # Stop the cluster manually. + f'id=`aws ec2 describe-instances --region {region} --filters ' + f'Name=tag:ray-cluster-name,Values={name_on_cloud} ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text`; ' + f'aws ec2 stop-instances --region {region} ' + '--instance-ids $id', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), + f'sky launch -c {name} -y "echo hi"', + f'sky logs {name} 1 --status', + f'sky logs {name} 3 --status', + # Ensure the skylet updated the stale job status. 
+ WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=name, + job_status=JobStatus.FAILED_DRIVER.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_stale_job_manual_restart(): + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.GCP.max_cluster_name_length()) + zone = 'us-west2-a' + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name={name_on_cloud})" ' + f'--zones={zone} --format="value(name)"') + stop_cmd = (f'gcloud compute instances stop --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'gcp_stale_job_manual_restart', + [ + f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', + f'sky exec {name} -d "echo start; sleep 10000"', + # Stop the cluster manually. + stop_cmd, + 'sleep 40', + f'sky launch -c {name} -y "echo hi"', + f'sky logs {name} 1 --status', + f'sky logs {name} 3 --status', + # Ensure the skylet updated the stale job status. + WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=name, + job_status=JobStatus.FAILED_DRIVER.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Check Sky's environment variables; workdir. ---------- +@pytest.mark.no_fluidstack # Requires amazon S3 +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +def test_env_check(generic_cloud: str): + name = get_cluster_name() + total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 + test = Test( + 'env_check', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + timeout=total_timeout_minutes * 60, + ) + run_one_test(test) + + +# ---------- CLI logs ---------- +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. +def test_cli_logs(generic_cloud: str): + name = get_cluster_name() + num_nodes = 2 + if generic_cloud == 'kubernetes': + # Kubernetes does not support multi-node + num_nodes = 1 + timestamp = time.time() + test = Test('cli_logs', [ + f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', + f'sky exec {name} "echo {timestamp} 2"', + f'sky exec {name} "echo {timestamp} 3"', + f'sky exec {name} "echo {timestamp} 4"', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 4 --sync-down', + f'sky logs {name} * --sync-down', + f'sky logs {name} 1 | grep "{timestamp} 1"', + f'sky logs {name} | grep "{timestamp} 4"', + ], f'sky down -y {name}') + run_one_test(test) + + +@pytest.mark.scp +def test_scp_logs(): + name = get_cluster_name() + timestamp = time.time() + test = Test( + 'SCP_cli_logs', + [ + f'sky launch -y -c {name} {SCP_TYPE} "echo {timestamp} 1"', + f'sky exec {name} "echo {timestamp} 2"', + f'sky exec {name} "echo {timestamp} 3"', + f'sky exec {name} "echo {timestamp} 4"', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 4 --sync-down', + f'sky logs {name} * --sync-down', + f'sky logs {name} 1 | grep "{timestamp} 1"', + f'sky logs {name} | grep "{timestamp} 4"', + ], + f'sky down -y {name}', ) run_one_test(test) + + +# ------- Testing the core API -------- +# Most of the core APIs have been tested in the CLI tests. +# These tests are for testing the return value of the APIs not fully used in CLI. 
+ + +@pytest.mark.gcp +def test_core_api_sky_launch_exec(): + name = get_cluster_name() + task = sky.Task(run="whoami") + task.set_resources(sky.Resources(cloud=sky.GCP())) + job_id, handle = sky.launch(task, cluster_name=name) + assert job_id == 1 + assert handle is not None + assert handle.cluster_name == name + assert handle.launched_resources.cloud.is_same_cloud(sky.GCP()) + job_id_exec, handle_exec = sky.exec(task, cluster_name=name) + assert job_id_exec == 2 + assert handle_exec is not None + assert handle_exec.cluster_name == name + assert handle_exec.launched_resources.cloud.is_same_cloud(sky.GCP()) + # For dummy task (i.e. task.run is None), the job won't be submitted. + dummy_task = sky.Task() + job_id_dummy, _ = sky.exec(dummy_task, cluster_name=name) + assert job_id_dummy is None + sky.down(name) + + +# The sky launch CLI has some additional checks to make sure the cluster is up/ +# restarted. However, the core API doesn't have these; make sure it still works +def test_core_api_sky_launch_fast(generic_cloud: str): + name = get_cluster_name() + cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) + try: + task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) + sky.launch(task, + cluster_name=name, + idle_minutes_to_autostop=1, + fast=True) + # Sleep to let the cluster autostop + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED, + timeout=120) + # Run it again - should work with fast=True + sky.launch(task, + cluster_name=name, + idle_minutes_to_autostop=1, + fast=True) + finally: + sky.down(name) + + +# ---------- Testing YAML Specs ---------- +# Our sky storage requires credentials to check the bucket existance when +# loading a task from the yaml file, so we cannot make it a unit test. +class TestYamlSpecs: + # TODO(zhwu): Add test for `to_yaml_config` for the Storage object. 
+ # We should not use `examples/storage_demo.yaml` here, since it requires + # users to ensure bucket names to not exist and/or be unique. + _TEST_YAML_PATHS = [ + 'examples/minimal.yaml', 'examples/managed_job.yaml', + 'examples/using_file_mounts.yaml', 'examples/resnet_app.yaml', + 'examples/multi_hostname.yaml' + ] + + def _is_dict_subset(self, d1, d2): + """Check if d1 is the subset of d2.""" + for k, v in d1.items(): + if k not in d2: + if isinstance(v, list) or isinstance(v, dict): + assert len(v) == 0, (k, v) + else: + assert False, (k, v) + elif isinstance(v, dict): + assert isinstance(d2[k], dict), (k, v, d2) + self._is_dict_subset(v, d2[k]) + elif isinstance(v, str): + if k == 'accelerators': + resources = sky.Resources() + resources._set_accelerators(v, None) + assert resources.accelerators == d2[k], (k, v, d2) + else: + assert v.lower() == d2[k].lower(), (k, v, d2[k]) + else: + assert v == d2[k], (k, v, d2[k]) + + def _check_equivalent(self, yaml_path): + """Check if the yaml is equivalent after load and dump again.""" + origin_task_config = common_utils.read_yaml(yaml_path) + + task = sky.Task.from_yaml(yaml_path) + new_task_config = task.to_yaml_config() + # d1 <= d2 + print(origin_task_config, new_task_config) + self._is_dict_subset(origin_task_config, new_task_config) + + def test_load_dump_yaml_config_equivalent(self): + """Test if the yaml config is equivalent after load and dump again.""" + pathlib.Path('~/datasets').expanduser().mkdir(exist_ok=True) + pathlib.Path('~/tmpfile').expanduser().touch() + pathlib.Path('~/.ssh').expanduser().mkdir(exist_ok=True) + pathlib.Path('~/.ssh/id_rsa.pub').expanduser().touch() + pathlib.Path('~/tmp-workdir').expanduser().mkdir(exist_ok=True) + pathlib.Path('~/Downloads/tpu').expanduser().mkdir(parents=True, + exist_ok=True) + for yaml_path in self._TEST_YAML_PATHS: + self._check_equivalent(yaml_path) + + +# ---------- Testing Multiple Accelerators ---------- +@pytest.mark.no_fluidstack # Fluidstack does not 
support K80 gpus for now +@pytest.mark.no_paperspace # Paperspace does not support K80 gpus +def test_multiple_accelerators_ordered(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-ordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs +@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs +def test_multiple_accelerators_ordered_with_default(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-ordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status {name} | grep Spot', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs +@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs +def test_multiple_accelerators_unordered(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-unordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs +@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs +def test_multiple_accelerators_unordered_with_default(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-unordered-with-default', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status {name} | grep Spot', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Requires other clouds to be enabled +def test_multiple_resources(): + name = get_cluster_name() + test = Test( + 'multiple-resources', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Sky Benchmark ---------- +@pytest.mark.no_fluidstack # Requires other clouds to be enabled +@pytest.mark.no_paperspace # Requires other clouds to be enabled +@pytest.mark.no_kubernetes +@pytest.mark.aws # SkyBenchmark requires S3 access +def test_sky_bench(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'sky-bench', + [ + f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', + 'sleep 120', + f'sky bench show {name} | grep sky-bench-{name} | grep FINISHED', + ], + f'sky bench down {name} -y; sky bench delete {name} -y', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def test_kubernetes_context_failover(): + """Test if the kubernetes context failover works. + + This test requires two kubernetes clusters: + - kind-skypilot: the local cluster with mock labels for 8 H100 GPUs. 
+ - another accessible cluster: with enough CPUs + To start the first cluster, run: + sky local up + # Add mock label for accelerator + kubectl label node --overwrite skypilot-control-plane skypilot.co/accelerator=h100 --context kind-skypilot + # Get the token for the cluster in context kind-skypilot + TOKEN=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.users[0].user.token}\') + # Get the API URL for the cluster in context kind-skypilot + API_URL=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.clusters[0].cluster.server}\') + # Add mock capacity for GPU + curl --header "Content-Type: application/json-patch+json" --header "Authorization: Bearer $TOKEN" --request PATCH --data \'[{"op": "add", "path": "/status/capacity/nvidia.com~1gpu", "value": "8"}]\' "$API_URL/api/v1/nodes/skypilot-control-plane/status" + # Add a new namespace to test the handling of namespaces + kubectl create namespace test-namespace --context kind-skypilot + # Set the namespace to test-namespace + kubectl config set-context kind-skypilot --namespace=test-namespace --context kind-skypilot + """ + # Get context that is not kind-skypilot + contexts = subprocess.check_output('kubectl config get-contexts -o name', + shell=True).decode('utf-8').split('\n') + context = [context for context in contexts if context != 'kind-skypilot'][0] + config = textwrap.dedent(f"""\ + kubernetes: + allowed_contexts: + - kind-skypilot + - {context} + """) + with tempfile.NamedTemporaryFile(delete=True) as f: + f.write(config.encode('utf-8')) + f.flush() + name = get_cluster_name() + test = Test( + 'kubernetes-context-failover', + [ + # Check if kind-skypilot is provisioned with H100 annotations already + 'NODE_INFO=$(kubectl get nodes -o yaml --context kind-skypilot) && ' + 'echo "$NODE_INFO" | grep nvidia.com/gpu | grep 8 && ' + 'echo "$NODE_INFO" | grep skypilot.co/accelerator | grep h100 || ' + '{ echo "kind-skypilot does not exist ' + 'or does not have mock labels for 
GPUs. Check the instructions in ' + 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', + # Check namespace for kind-skypilot is test-namespace + 'kubectl get namespaces --context kind-skypilot | grep test-namespace || ' + '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in ' + 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', + 'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"', + # Get contexts and set current context to the other cluster that is not kind-skypilot + f'kubectl config use-context {context}', + # H100 should not in the current context + '! sky show-gpus --cloud kubernetes | grep H100', + f'sky launch -y -c {name}-1 --cpus 1 echo hi', + f'sky logs {name}-1 --status', + # It should be launched not on kind-skypilot + f'sky status -a {name}-1 | grep "{context}"', + # Test failure for launching H100 on other cluster + f'sky launch -y -c {name}-2 --gpus H100 --cpus 1 --cloud kubernetes --region {context} echo hi && exit 1 || true', + # Test failover + f'sky launch -y -c {name}-3 --gpus H100 --cpus 1 --cloud kubernetes echo hi', + f'sky logs {name}-3 --status', + # Test pods + f'kubectl get pods --context kind-skypilot | grep "{name}-3"', + # It should be launched on kind-skypilot + f'sky status -a {name}-3 | grep "kind-skypilot"', + # Should be 7 free GPUs + f'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep " 7"', + # Remove the line with "kind-skypilot" + f'sed -i "/kind-skypilot/d" {f.name}', + # Should still be able to exec and launch on existing cluster + f'sky exec {name}-3 "echo hi"', + f'sky logs {name}-3 --status', + f'sky status -r {name}-3 | grep UP', + f'sky launch -c {name}-3 --gpus h100 echo hi', + f'sky logs {name}-3 --status', + f'sky status -r {name}-3 | grep UP', + ], + f'sky down -y {name}-1 {name}-3', + env={'SKYPILOT_CONFIG': f.name}, + ) + run_one_test(test) diff --git 
a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py new file mode 100644 index 00000000000..22b6d9dc8f0 --- /dev/null +++ b/tests/smoke_tests/test_cluster_job.py @@ -0,0 +1,1657 @@ +# Smoke tests for SkyPilot for sky launched cluster and cluster job +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_cluster_job.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_cluster_job.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_cluster_job.py::test_job_queue +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_cluster_job.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_cluster_job.py --generic-cloud aws + +import pathlib +import tempfile +import textwrap + +import jinja2 +import pytest +from smoke_tests.util import BUMP_UP_SECONDS +from smoke_tests.util import get_aws_region_for_quota_failover +from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_gcp_region_for_quota_failover +from smoke_tests.util import get_timeout +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import Test +from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID + +import sky +from sky import AWS +from sky import Azure +from sky import GCP +from sky.skylet import constants +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils + + +# ---------- Job Queue. 
# ---------- Job Queue. ----------
@pytest.mark.no_fluidstack  # FluidStack DC has low availability of T4 GPUs
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have T4 gpus
@pytest.mark.no_ibm  # IBM Cloud does not have T4 gpus. run test_ibm_job_queue instead
@pytest.mark.no_scp  # SCP does not have T4 gpus. Run test_scp_job_queue instead
@pytest.mark.no_paperspace  # Paperspace does not have T4 gpus.
@pytest.mark.no_oci  # OCI does not have T4 gpus
def test_job_queue(generic_cloud: str):
    """Queue three jobs on one cluster; check RUNNING/PENDING and cancel flow."""
    name = get_cluster_name()
    # Submit three detached jobs; the cluster has capacity for two, so the
    # third must sit in PENDING until a slot frees up.
    submit_and_check = [
        f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml',
        f'sky exec {name} -n {name}-1 -d examples/job_queue/job.yaml',
        f'sky exec {name} -n {name}-2 -d examples/job_queue/job.yaml',
        f'sky exec {name} -n {name}-3 -d examples/job_queue/job.yaml',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
    ]
    # Cancelling a RUNNING job should promote the PENDING one.
    cancel_and_check = [
        f'sky cancel -y {name} 2',
        'sleep 5',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
        f'sky cancel -y {name} 3',
        # Fractional and full GPU requests both expose exactly 1 GPU per node.
        f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
        f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
        f'sky logs {name} 4 --status',
        f'sky logs {name} 5 --status',
    ]
    test = Test(
        'job_queue',
        submit_and_check + cancel_and_check,
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Job Queue with Docker.
# ----------
@pytest.mark.no_fluidstack  # FluidStack does not support docker for now
@pytest.mark.no_lambda_cloud  # Doesn't support Lambda Cloud for now
@pytest.mark.no_ibm  # Doesn't support IBM Cloud for now
@pytest.mark.no_paperspace  # Paperspace doesn't have T4 GPUs
@pytest.mark.no_scp  # Doesn't support SCP for now
@pytest.mark.no_oci  # Doesn't support OCI for now
@pytest.mark.no_kubernetes  # Doesn't support Kubernetes for now
@pytest.mark.parametrize(
    'image_id',
    [
        'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04',
        'docker:ubuntu:18.04',
        # Test latest image with python 3.11 installed by default.
        'docker:continuumio/miniconda3:24.1.2-0',
        # Test python>=3.12 where SkyPilot should automatically create a separate
        # conda env for runtime with python 3.10.
        'docker:continuumio/miniconda3:latest',
        # Axolotl image is a good example custom image that has its conda path
        # set in PATH with dockerfile and uses python>=3.12. It could test:
        # 1. we handle the env var set in dockerfile correctly
        # 2. python>=3.12 works with SkyPilot runtime.
        'docker:winglian/axolotl:main-latest'
    ])
def test_job_queue_with_docker(generic_cloud: str, image_id: str):
    """Job-queue semantics inside docker images, including stop/start survival."""
    # Suffix the cluster name with the image tag so parametrized runs don't clash.
    name = get_cluster_name() + image_id[len('docker:'):][:4]
    total_timeout_minutes = 40 if generic_cloud == 'azure' else 15
    time_to_sleep = 300 if generic_cloud == 'azure' else 180
    test = Test(
        'job_queue_with_docker',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml',
            f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
            f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
            f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
            # Make sure the GPU is still visible to the container.
            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
            f'sky logs {name} 4 --status',
            f'sky stop -y {name}',
            # Make sure the job status preserve after stop and start the
            # cluster. This is also a test for the docker container to be
            # preserved after stop and start.
            f'sky start -y {name}',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED',
            f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky logs {name} 5 --status',
            f'sky logs {name} 6 --status',
            # Make sure it is still visible after an stop & start cycle.
            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
            f'sky logs {name} 7 --status'
        ],
        f'sky down -y {name}',
        timeout=total_timeout_minutes * 60,
    )
    run_one_test(test)


@pytest.mark.lambda_cloud
def test_lambda_job_queue():
    """Job queue on Lambda Cloud with fractional A10 GPUs."""
    name = get_cluster_name()
    test = Test(
        'lambda_job_queue',
        [
            f'sky launch -y -c {name} {LAMBDA_TYPE} examples/job_queue/cluster.yaml',
            f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml',
            f'sky queue {name} | grep {name}-1 | grep RUNNING',
            f'sky queue {name} | grep {name}-2 | grep RUNNING',
            f'sky queue {name} | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.ibm
def test_ibm_job_queue():
    """Job queue on IBM Cloud with V100 GPUs."""
    name = get_cluster_name()
    test = Test(
        'ibm_job_queue',
        [
            f'sky launch -y -c {name} --cloud ibm --gpus v100',
            f'sky exec {name} -n {name}-1 --cloud ibm -d examples/job_queue/job_ibm.yaml',
            f'sky exec {name} -n {name}-2 --cloud ibm -d examples/job_queue/job_ibm.yaml',
            f'sky exec {name} -n {name}-3 --cloud ibm -d examples/job_queue/job_ibm.yaml',
            f'sky queue {name} | grep {name}-1 | grep RUNNING',
            f'sky queue {name} | grep {name}-2 | grep RUNNING',
            f'sky queue {name} | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.scp
def test_scp_job_queue():
    """Job queue on SCP with fractional V100 GPUs."""
    name = get_cluster_name()
    num_of_gpu_launch = 1
    num_of_gpu_exec = 0.5
    test = Test(
        'SCP_job_queue',
        [
            f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml',
            f'sky exec {name} -n {name}-1 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-2 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-3 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml',
            f'sky queue {name} | grep {name}-1 | grep RUNNING',
            f'sky queue {name} | grep {name}-2 | grep RUNNING',
            f'sky queue {name} | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # FluidStack DC has low availability of T4 GPUs
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have T4 gpus
@pytest.mark.no_ibm  # IBM Cloud does not have T4 gpus. run test_ibm_job_queue_multinode instead
@pytest.mark.no_paperspace  # Paperspace does not have T4 gpus.
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
@pytest.mark.no_oci  # OCI Cloud does not have T4 gpus.
@pytest.mark.no_kubernetes  # Kubernetes not support num_nodes > 1 yet
def test_job_queue_multinode(generic_cloud: str):
    """Job queue and SETTING_UP/cancel semantics on a multi-node cluster."""
    name = get_cluster_name()
    total_timeout_minutes = 30 if generic_cloud == 'azure' else 15
    test = Test(
        'job_queue_multinode',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml',
            f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml',
            f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml',
            f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)',
            'sleep 90',
            f'sky cancel -y {name} 1',
            'sleep 5',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP',
            f'sky cancel -y {name} 1 2 3',
            f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml',
            # Test the job status is correctly set to SETTING_UP, during the setup is running,
            # and the job can be cancelled during the setup.
            'sleep 5',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)',
            f'sky cancel -y {name} 4',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)',
            f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky logs {name} 5 --status',
            f'sky logs {name} 6 --status',
            f'sky logs {name} 7 --status',
        ],
        f'sky down -y {name}',
        timeout=total_timeout_minutes * 60,
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # No FluidStack VM has 8 CPUs
@pytest.mark.no_lambda_cloud  # No Lambda Cloud VM has 8 CPUs
def test_large_job_queue(generic_cloud: str):
    """FIFO scheduling with 75 queued jobs on an 8-CPU cluster."""
    name = get_cluster_name()
    test = Test(
        'large_job_queue',
        [
            f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}',
            f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done',
            f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16',
            'sleep 90',
            # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there
            # should be 8 / 0.5 = 16 jobs running.
            # The first 16 jobs are canceled, so there should be 75 - 32 = 43
            # jobs PENDING.
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43',
            # Make sure the jobs are scheduled in FIFO order
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED'
                for i in range(1, 17)
            ],
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING'
                for i in range(17, 33)
            ],
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep PENDING'
                for i in range(33, 75)
            ],
            f'sky cancel -y {name} 33 35 37 39 17 18 19',
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED'
                for i in range(33, 40, 2)
            ],
            'sleep 10',
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING'
                for i in [34, 36, 38]
            ],
        ],
        f'sky down -y {name}',
        timeout=25 * 60,
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # No FluidStack VM has 8 CPUs
@pytest.mark.no_lambda_cloud  # No Lambda Cloud VM has 8 CPUs
def test_fast_large_job_queue(generic_cloud: str):
    # This is to test the jobs can be scheduled quickly when there are many
    # jobs in the queue.
    name = get_cluster_name()
    test = Test(
        'fast_large_job_queue',
        [
            f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}',
            f'for i in `seq 1 32`; do sky exec {name} -n {name}-$i -d "echo $i"; done',
            'sleep 60',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep SUCCEEDED | wc -l | grep 32',
        ],
        f'sky down -y {name}',
        timeout=20 * 60,
    )
    run_one_test(test)


@pytest.mark.ibm
def test_ibm_job_queue_multinode():
    """Multi-node job queue on IBM Cloud, including SETTING_UP/cancel flow."""
    name = get_cluster_name()
    task_file = 'examples/job_queue/job_multinode_ibm.yaml'
    test = Test(
        'ibm_job_queue_multinode',
        [
            f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2',
            f'sky exec {name} -n {name}-1 -d {task_file}',
            f'sky exec {name} -n {name}-2 -d {task_file}',
            f'sky launch -y -c {name} -n {name}-3 --detach-setup -d {task_file}',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep SETTING_UP)',
            'sleep 90',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep PENDING)',
            f'sky cancel -y {name} 1',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 1 2 3',
            f'sky launch -c {name} -n {name}-4 --detach-setup -d {task_file}',
            # Test the job status is correctly set to SETTING_UP, during the setup is running,
            # and the job can be cancelled during the setup.
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)',
            f'sky cancel -y {name} 4',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)',
            f'sky exec {name} --gpus v100:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus v100:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus v100:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky logs {name} 5 --status',
            f'sky logs {name} 6 --status',
            f'sky logs {name} 7 --status',
        ],
        f'sky down -y {name}',
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


# ---------- Docker with preinstalled package. ----------
@pytest.mark.no_fluidstack  # Doesn't support Fluidstack for now
@pytest.mark.no_lambda_cloud  # Doesn't support Lambda Cloud for now
@pytest.mark.no_ibm  # Doesn't support IBM Cloud for now
@pytest.mark.no_scp  # Doesn't support SCP for now
@pytest.mark.no_oci  # Doesn't support OCI for now
@pytest.mark.no_kubernetes  # Doesn't support Kubernetes for now
# TODO(zhwu): we should fix this for kubernetes
def test_docker_preinstalled_package(generic_cloud: str):
    """Launch a docker:nginx cluster and verify the preinstalled binary works."""
    name = get_cluster_name()
    test = Test(
        'docker_with_preinstalled_package',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx',
            f'sky exec {name} "nginx -V"',
            f'sky logs {name} 1 --status',
            f'sky exec {name} whoami | grep root',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Submitting multiple tasks to the same cluster.
# ----------
@pytest.mark.no_fluidstack  # FluidStack DC has low availability of T4 GPUs
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have T4 gpus
@pytest.mark.no_paperspace  # Paperspace does not have T4 gpus
@pytest.mark.no_ibm  # IBM Cloud does not have T4 gpus
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
@pytest.mark.no_oci  # OCI Cloud does not have T4 gpus
def test_multi_echo(generic_cloud: str):
    """Submit 32 parallel echo jobs and check scheduler throughput/success."""
    name = get_cluster_name()
    # Poll the queue several times: no job may ever show FAILED while the
    # scheduler works through the batch.
    no_failure_checks = [
        f'python examples/multi_echo.py {name} {generic_cloud}',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        'sleep 10',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        'sleep 30',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        'sleep 30',
        # Make sure that our job scheduler is fast enough to have at least
        # 10 RUNNING jobs in parallel.
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "RUNNING" | wc -l | awk \'{{if ($1 < 10) exit 1}}\'',
        'sleep 30',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done',
    ]
    # Ensure jobs succeeded.
    success_checks = [
        WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format(
            cluster_name=name,
            job_id=i + 1,
            job_status=JobStatus.SUCCEEDED.value,
            timeout=120) for i in range(32)
    ]
    # Ensure monitor/autoscaler didn't crash on the 'assert not
    # unfulfilled' error. If process not found, grep->ssh returns 1.
    monitor_check = [f'ssh {name} \'ps aux | grep "[/]"monitor.py\'']
    test = Test(
        'multi_echo',
        no_failure_checks + success_checks + monitor_check,
        f'sky down -y {name}',
        timeout=20 * 60,
    )
    run_one_test(test)


# ---------- Task: 1 node training.
# ----------
@pytest.mark.no_fluidstack  # Fluidstack does not have T4 gpus for now
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have V100 gpus
@pytest.mark.no_ibm  # IBM cloud currently doesn't provide public image with CUDA
@pytest.mark.no_scp  # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead.
def test_huggingface(generic_cloud: str):
    """Run the HuggingFace GLUE/IMDB example via launch, then again via exec."""
    name = get_cluster_name()
    test = Test(
        'huggingface_glue_imdb_app',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky exec {name} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.lambda_cloud
def test_lambda_huggingface(generic_cloud: str):
    """Same HuggingFace example pinned to a Lambda Cloud instance type."""
    name = get_cluster_name()
    test = Test(
        'lambda_huggingface_glue_imdb_app',
        [
            f'sky launch -y -c {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky exec {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.scp
def test_scp_huggingface(generic_cloud: str):
    """Same HuggingFace example pinned to SCP with a V100 GPU."""
    name = get_cluster_name()
    num_of_gpu_launch = 1
    test = Test(
        'SCP_huggingface_glue_imdb_app',
        [
            f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky exec {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Inferentia.
# ----------
@pytest.mark.aws
def test_inferentia():
    """Launch an AWS Inferentia (inf2) instance and schedule on the accelerator."""
    name = get_cluster_name()
    test = Test(
        'test_inferentia',
        [
            f'sky launch -y -c {name} -t inf2.xlarge -- echo hi',
            f'sky exec {name} --gpus Inferentia:1 echo hi',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- TPU. ----------
@pytest.mark.gcp
@pytest.mark.tpu
def test_tpu():
    """Launch the classic TPU-node example and check idempotent re-launch."""
    name = get_cluster_name()
    test = Test(
        'tpu_app',
        [
            f'sky launch -y -c {name} examples/tpu/tpu_app.yaml',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            # Ensure sky launch won't create another TPU.
            f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"',
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take >20 mins
    )
    run_one_test(test)


# ---------- TPU VM. ----------
@pytest.mark.gcp
@pytest.mark.tpu
def test_tpu_vm():
    """TPU VM lifecycle: run, stop, restart with retry, run again."""
    name = get_cluster_name()
    test = Test(
        'tpu_vm_app',
        [
            f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky stop -y {name}',
            # Ensure the cluster is STOPPED.
            f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',
            # Use retry: guard against transient errors observed for
            # just-stopped TPU VMs (#962).
            f'sky start --retry-until-up -y {name}',
            f'sky exec {name} examples/tpu/tpuvm_mnist.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
            f'sky stop -y {name}',
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take 30 mins
    )
    run_one_test(test)


# ---------- TPU VM Pod.
# ----------
@pytest.mark.gcp
@pytest.mark.tpu
def test_tpu_vm_pod():
    """Launch a spot TPU v2-32 pod slice and run the MNIST example."""
    name = get_cluster_name()
    test = Test(
        'tpu_pod',
        [
            f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take 30 mins
    )
    run_one_test(test)


# ---------- TPU Pod Slice on GKE. ----------
@pytest.mark.kubernetes
def test_tpu_pod_slice_gke():
    """Run the MNIST example on a GKE TPU v5 lite pod slice."""
    name = get_cluster_name()
    test = Test(
        'tpu_pod_slice_gke',
        [
            f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            # Ensure TPU is reachable.
            f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"',
            f'sky logs {name} 2 --status'
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take 30 mins
    )
    run_one_test(test)


# ---------- Simple apps. ----------
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
def test_multi_hostname(generic_cloud: str):
    """Two-node task: each node prints its hostname; re-exec also succeeds."""
    name = get_cluster_name()
    total_timeout_minutes = 25 if generic_cloud == 'azure' else 15
    test = Test(
        'multi_hostname',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky logs {name} 1 | grep "My hostname:" | wc -l | grep 2',  # Ensure there are 2 hosts.
            f'sky exec {name} examples/multi_hostname.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
        timeout=get_timeout(generic_cloud, total_timeout_minutes * 60),
    )
    run_one_test(test)


@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
def test_multi_node_failure(generic_cloud: str):
    """A failing worker must mark setup/run FAILED across a multi-node task."""
    name = get_cluster_name()
    test = Test(
        'multi_node_failure',
        [
            # TODO(zhwu): we use multi-thread to run the commands in setup
            # commands in parallel, which makes it impossible to fail fast
            # when one of the nodes fails. We should fix this in the future.
            # The --detach-setup version can fail fast, as the setup is
            # submitted to the remote machine, which does not use multi-thread.
            # Refer to the comment in `subprocess_utils.run_in_parallel`.
            # f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/failed_worker_setup.yaml && exit 1',  # Ensure the job setup failed.
            f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup tests/test_yamls/failed_worker_setup.yaml',
            f'sky logs {name} 1 --status | grep FAILED_SETUP',  # Ensure the job setup failed.
            f'sky exec {name} tests/test_yamls/failed_worker_run.yaml',
            f'sky logs {name} 2 --status | grep FAILED',  # Ensure the job failed.
            f'sky logs {name} 2 | grep "My hostname:" | wc -l | grep 2',  # Ensure there 2 of the hosts printed their hostname.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on GCP. ----------
@pytest.mark.gcp
def test_gcp_http_server_with_custom_ports():
    """Expose port 33828 on GCP and curl the demo page through the endpoint."""
    name = get_cluster_name()
    test = Test(
        'gcp_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            # NOTE(review): the grep literal was garbled in the source paste;
            # reconstructed as the demo page's <h1> line — confirm against
            # examples/http_server_with_custom_ports/task.yaml.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on AWS. ----------
@pytest.mark.aws
def test_aws_http_server_with_custom_ports():
    """Expose port 33828 on AWS and curl the demo page through the endpoint."""
    name = get_cluster_name()
    test = Test(
        'aws_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi'
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on Azure. ----------
@pytest.mark.azure
def test_azure_http_server_with_custom_ports():
    """Expose port 33828 on Azure and curl the demo page through the endpoint."""
    name = get_cluster_name()
    test = Test(
        'azure_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi'
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on Kubernetes. ----------
@pytest.mark.kubernetes
def test_kubernetes_http_server_with_custom_ports():
    """Expose port 33828 on Kubernetes; more retries since LB IPs come up slowly."""
    name = get_cluster_name()
    test = Test(
        'kubernetes_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 100); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 5; done; if [ "$success" = false ]; then exit 1; fi'
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on Paperspace. ----------
@pytest.mark.paperspace
def test_paperspace_http_server_with_custom_ports():
    """Expose port 33828 on Paperspace and curl the demo page."""
    name = get_cluster_name()
    test = Test(
        'paperspace_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on RunPod. ----------
@pytest.mark.runpod
def test_runpod_http_server_with_custom_ports():
    """Expose port 33828 on RunPod and curl the demo page."""
    name = get_cluster_name()
    test = Test(
        'runpod_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Labels from task on AWS (instance_tags) ----------
@pytest.mark.aws
def test_task_labels_aws():
    """Render the labels template for AWS and verify tags via the AWS CLI."""
    name = get_cluster_name()
    template_str = pathlib.Path(
        'tests/test_yamls/test_labels.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(cloud='aws', region='us-east-1')
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test = Test(
            'task_labels_aws',
            [
                f'sky launch -y -c {name} {file_path}',
                # Verify with aws cli that the tags are set.
                'aws ec2 describe-instances '
                '--query "Reservations[*].Instances[*].InstanceId" '
                '--filters "Name=instance-state-name,Values=running" '
                f'--filters "Name=tag:skypilot-cluster-name,Values={name}*" '
                '--filters "Name=tag:inlinelabel1,Values=inlinevalue1" '
                '--filters "Name=tag:inlinelabel2,Values=inlinevalue2" '
                '--region us-east-1 --output text',
            ],
            f'sky down -y {name}',
        )
        run_one_test(test)


# ---------- Labels from task on GCP (labels) ----------
@pytest.mark.gcp
def test_task_labels_gcp():
    """Render the labels template for GCP and verify labels via gcloud."""
    name = get_cluster_name()
    template_str = pathlib.Path(
        'tests/test_yamls/test_labels.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(cloud='gcp')
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test = Test(
            'task_labels_gcp',
            [
                f'sky launch -y -c {name} {file_path}',
                # Verify with gcloud cli that the tags are set
                f'gcloud compute instances list --filter="name~\'^{name}\' AND '
                'labels.inlinelabel1=\'inlinevalue1\' AND '
                'labels.inlinelabel2=\'inlinevalue2\'" '
                '--format="value(name)" | grep .',
            ],
            f'sky down -y {name}',
        )
        run_one_test(test)


# ---------- Labels from task on
Kubernetes (labels) ---------- +@pytest.mark.kubernetes +def test_task_labels_kubernetes(): + name = get_cluster_name() + template_str = pathlib.Path( + 'tests/test_yamls/test_labels.yaml.j2').read_text() + template = jinja2.Template(template_str) + content = template.render(cloud='kubernetes') + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(content) + f.flush() + file_path = f.name + test = Test( + 'task_labels_kubernetes', + [ + f'sky launch -y -c {name} {file_path}', + # Verify with kubectl that the labels are set. + 'kubectl get pods ' + '--selector inlinelabel1=inlinevalue1 ' + '--selector inlinelabel2=inlinevalue2 ' + '-o jsonpath=\'{.items[*].metadata.name}\' | ' + f'grep \'^{name}\'' + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Pod Annotations on Kubernetes ---------- +@pytest.mark.kubernetes +def test_add_pod_annotations_for_autodown_with_launch(): + name = get_cluster_name() + test = Test( + 'add_pod_annotations_for_autodown_with_launch', + [ + # Launch Kubernetes cluster with two nodes, each being head node and worker node. + # Autodown is set. + f'sky launch -y -c {name} -i 10 --down --num-nodes 2 --cpus=1 --cloud kubernetes', + # Get names of the pods containing cluster name. + f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', + f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', + # Describe the first pod and check for annotations. + 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + # Describe the second pod and check for annotations. 
+ 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop' + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def test_add_and_remove_pod_annotations_with_autostop(): + name = get_cluster_name() + test = Test( + 'add_and_remove_pod_annotations_with_autostop', + [ + # Launch Kubernetes cluster with two nodes, each being head node and worker node. + f'sky launch -y -c {name} --num-nodes 2 --cpus=1 --cloud kubernetes', + # Set autodown on the cluster with 'autostop' command. + f'sky autostop -y {name} -i 20 --down', + # Get names of the pods containing cluster name. + f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', + f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', + # Describe the first pod and check for annotations. + 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + # Describe the second pod and check for annotations. + 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', + # Cancel the set autodown to remove the annotations from the pods. + f'sky autostop -y {name} --cancel', + # Describe the first pod and check if annotations are removed. + '! kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', + '! kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + # Describe the second pod and check if annotations are removed. + '! kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', + '! 
kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Container logs from task on Kubernetes ---------- +@pytest.mark.kubernetes +def test_container_logs_multinode_kubernetes(): + name = get_cluster_name() + task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' + head_logs = ('kubectl get pods ' + f' | grep {name} | grep head | ' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + worker_logs = ('kubectl get pods ' + f' | grep {name} | grep worker |' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + test = Test( + 'container_logs_multinode_kubernetes', + [ + f'sky launch -y -c {name} {task_yaml} --num-nodes 2', + f'{head_logs} | wc -l | grep 9', + f'{worker_logs} | wc -l | grep 9', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def test_container_logs_two_jobs_kubernetes(): + name = get_cluster_name() + task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' + pod_logs = ('kubectl get pods ' + f' | grep {name} | grep head |' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + test = Test( + 'test_container_logs_two_jobs_kubernetes', + [ + f'sky launch -y -c {name} {task_yaml}', + f'{pod_logs} | wc -l | grep 9', + f'sky launch -y -c {name} {task_yaml}', + f'{pod_logs} | wc -l | grep 18', + f'{pod_logs} | grep 1 | wc -l | grep 2', + f'{pod_logs} | grep 2 | wc -l | grep 2', + f'{pod_logs} | grep 3 | wc -l | grep 2', + f'{pod_logs} | grep 4 | wc -l | grep 2', + f'{pod_logs} | grep 5 | wc -l | grep 2', + f'{pod_logs} | grep 6 | wc -l | grep 2', + f'{pod_logs} | grep 7 | wc -l | grep 2', + f'{pod_logs} | grep 8 | wc -l | grep 2', + f'{pod_logs} | grep 9 | wc -l | grep 2', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def 
test_container_logs_two_simultaneous_jobs_kubernetes(): + name = get_cluster_name() + task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' + pod_logs = ('kubectl get pods ' + f' | grep {name} | grep head |' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + test = Test( + 'test_container_logs_two_simultaneous_jobs_kubernetes', + [ + f'sky launch -y -c {name}', + f'sky exec -c {name} -d {task_yaml}', + f'sky exec -c {name} -d {task_yaml}', + 'sleep 30', + f'{pod_logs} | wc -l | grep 18', + f'{pod_logs} | grep 1 | wc -l | grep 2', + f'{pod_logs} | grep 2 | wc -l | grep 2', + f'{pod_logs} | grep 3 | wc -l | grep 2', + f'{pod_logs} | grep 4 | wc -l | grep 2', + f'{pod_logs} | grep 5 | wc -l | grep 2', + f'{pod_logs} | grep 6 | wc -l | grep 2', + f'{pod_logs} | grep 7 | wc -l | grep 2', + f'{pod_logs} | grep 8 | wc -l | grep 2', + f'{pod_logs} | grep 9 | wc -l | grep 2', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Task: n=2 nodes with setups. ---------- +@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus +@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +@pytest.mark.skip( + reason= + 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') +def test_distributed_tf(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'resnet_distributed_tf_app', + [ + # NOTE: running it twice will hang (sometimes?) - an app-level bug. + f'python examples/resnet_distributed_tf_app.py {name} {generic_cloud}', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + timeout=25 * 60, # 25 mins (it takes around ~19 mins) + ) + run_one_test(test) + + +# ---------- Testing GCP start and stop instances ---------- +@pytest.mark.gcp +def test_gcp_start_stop(): + name = get_cluster_name() + test = Test( + 'gcp-start-stop', + [ + f'sky launch -y -c {name} examples/gcp_start_stop.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky exec {name} examples/gcp_start_stop.yaml', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. + f'sky logs {name} 3 --status', # Ensure the job succeeded. + f'sky stop -y {name}', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), + f'sky start -y {name} -i 1', + f'sky exec {name} examples/gcp_start_stop.yaml', + f'sky logs {name} 4 --status', # Ensure the job succeeded. + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=200), + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Testing Azure start and stop instances ---------- +@pytest.mark.azure +def test_azure_start_stop(): + name = get_cluster_name() + test = Test( + 'azure-start-stop', + [ + f'sky launch -y -c {name} examples/azure_start_stop.yaml', + f'sky exec {name} examples/azure_start_stop.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. + f'sky logs {name} 2 --status', # Ensure the job succeeded. 
+ f'sky stop -y {name}', + f'sky start -y {name} -i 1', + f'sky exec {name} examples/azure_start_stop.yaml', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=280) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', + ], + f'sky down -y {name}', + timeout=30 * 60, # 30 mins + ) + run_one_test(test) + + +# ---------- Testing Autostopping ---------- +@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances +@pytest.mark.no_ibm # FIX(IBM) sporadically fails, as restarted workers stay uninitialized indefinitely +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +@pytest.mark.no_kubernetes # Kubernetes does not autostop yet +def test_autostop(generic_cloud: str): + name = get_cluster_name() + # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure + # the VM is stopped. + autostop_timeout = 600 if generic_cloud == 'azure' else 250 + # Launching and starting Azure clusters can take a long time too. e.g., restart + # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. + total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 + test = Test( + 'autostop', + [ + f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} -i 1', + + # Ensure autostop is set. + f'sky status | grep {name} | grep "1m"', + + # Ensure the cluster is not stopped early. + 'sleep 40', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', + + # Ensure the cluster is STOPPED. 
+ WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), + + # Ensure the cluster is UP and the autostop setting is reset ('-'). + f'sky start -y {name}', + f'sky status | grep {name} | grep -E "UP\s+-"', + + # Ensure the job succeeded. + f'sky exec {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + + # Test restarting the idleness timer via reset: + f'sky autostop -y {name} -i 1', # Idleness starts counting. + 'sleep 40', # Almost reached the threshold. + f'sky autostop -y {name} -i 1', # Should restart the timer. + 'sleep 40', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), + + # Test restarting the idleness timer via exec: + f'sky start -y {name}', + f'sky status | grep {name} | grep -E "UP\s+-"', + f'sky autostop -y {name} -i 1', # Idleness starts counting. + 'sleep 45', # Almost reached the threshold. + f'sky exec {name} echo hi', # Should restart the timer. + 'sleep 45', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout + BUMP_UP_SECONDS), + ], + f'sky down -y {name}', + timeout=total_timeout_minutes * 60, + ) + run_one_test(test) + + +# ---------- Testing Autodowning ---------- +@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. +def test_autodown(generic_cloud: str): + name = get_cluster_name() + # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure + # the VM is terminated. 
+ autodown_timeout = 900 if generic_cloud == 'azure' else 240 + total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 + test = Test( + 'autodown', + [ + f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --down -i 1', + # Ensure autostop is set. + f'sky status | grep {name} | grep "1m (down)"', + # Ensure the cluster is not terminated early. + 'sleep 40', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', + # Ensure the cluster is terminated. + f'sleep {autodown_timeout}', + f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. + f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep "1m (down)"', + f'sleep {autodown_timeout}', + # Ensure the cluster is terminated. + f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --cancel', + f'sleep {autodown_timeout}', + # Ensure the cluster is still UP. 
+ f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP', + ], + f'sky down -y {name}', + timeout=total_timeout_minutes * 60, + ) + run_one_test(test) + + +@pytest.mark.scp +def test_scp_autodown(): + name = get_cluster_name() + test = Test( + 'SCP_autodown', + [ + f'sky launch -y -d -c {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --down -i 1', + # Ensure autostop is set. + f'sky status | grep {name} | grep "1m (down)"', + # Ensure the cluster is not terminated early. + 'sleep 45', + f'sky status --refresh | grep {name} | grep UP', + # Ensure the cluster is terminated. + 'sleep 200', + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. + f'sky exec {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep "1m (down)"', + 'sleep 200', + # Ensure the cluster is terminated. + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --cancel', + 'sleep 200', + # Ensure the cluster is still UP. + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && echo "$s" | grep {name} | grep UP', + ], + f'sky down -y {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): + test = Test( + f'{cloud}-cancel-task', + [ + f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', + # Wait the GPU process to start. 
+ 'sleep 60', + f'sky exec {name} "nvidia-smi | grep python"', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky cancel -y {name} 1', + 'sleep 60', + # check if the python job is gone. + f'sky exec {name} "! nvidia-smi | grep python"', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=timeout, + ) + return test + + +# ---------- Testing `sky cancel` ---------- +@pytest.mark.aws +def test_cancel_aws(): + name = get_cluster_name() + test = _get_cancel_task_with_cloud(name, 'aws') + run_one_test(test) + + +@pytest.mark.gcp +def test_cancel_gcp(): + name = get_cluster_name() + test = _get_cancel_task_with_cloud(name, 'gcp') + run_one_test(test) + + +@pytest.mark.azure +def test_cancel_azure(): + name = get_cluster_name() + test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now +@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus +@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA +@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +def test_cancel_pytorch(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'cancel-pytorch', + [ + f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d', + # Wait the GPU process to start. + 'sleep 90', + f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || ' + # When run inside container/k8s, nvidia-smi cannot show process ids. + # See https://github.com/NVIDIA/nvidia-docker/issues/179 + # To work around, we check if GPU utilization is greater than 0. + f'[ \$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits) -gt 0 ]"', + f'sky logs {name} 2 --status', # Ensure the job succeeded. 
+ f'sky cancel -y {name} 1', + 'sleep 60', + f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep \'No running process\') || ' + # Ensure Xorg is the only process running. + '[ \$(nvidia-smi | grep -A 10 Processes | grep -A 10 === | grep -v Xorg) -eq 2 ]"', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + +# can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` +# requires a CUDA public image, which IBM doesn't offer +@pytest.mark.ibm +def test_cancel_ibm(): + name = get_cluster_name() + test = Test( + 'ibm-cancel-task', + [ + f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', + f'sky exec {name} -n {name}-1 -d "while true; do echo \'Hello SkyPilot\'; sleep 2; done"', + 'sleep 20', + f'sky queue {name} | grep {name}-1 | grep RUNNING', + f'sky cancel -y {name} 2', + f'sleep 5', + f'sky queue {name} | grep {name}-1 | grep CANCELLED', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Testing use-spot option ---------- +@pytest.mark.no_fluidstack # FluidStack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +def test_use_spot(generic_cloud: str): + """Test use-spot and sky exec.""" + name = get_cluster_name() + test = Test( + 'use-spot', + [ + f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', + f'sky logs {name} 1 --status', + f'sky exec {name} echo hi', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_stop_gcp_spot(): + """Test GCP spot can be stopped, autostopped, restarted.""" + name = 
get_cluster_name() + test = Test( + 'stop_gcp_spot', + [ + f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', + # stop should go through: + f'sky stop {name} -y', + f'sky start {name} -y', + f'sky exec {name} -- ls myfile', + f'sky logs {name} 2 --status', + f'sky autostop {name} -i0 -y', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=90), + f'sky start {name} -y', + f'sky exec {name} -- ls myfile', + f'sky logs {name} 3 --status', + # -i option at launch should go through: + f'sky launch -c {name} -i0 -y', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=120), + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Testing env ---------- +def test_inline_env(generic_cloud: str): + """Test env""" + name = get_cluster_name() + test = Test( + 'test-inline-env', + [ + f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + 'sleep 20', + f'sky logs {name} 1 --status', + f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + get_timeout(generic_cloud), + ) + run_one_test(test) + + +# ---------- Testing env file ---------- +def test_inline_env_file(generic_cloud: str): + """Test env""" + name = get_cluster_name() + test = Test( + 'test-inline-env-file', + [ + f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + f'sky logs {name} 1 --status', + f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + get_timeout(generic_cloud), + ) + run_one_test(test) + + +# ---------- Testing custom image ---------- +@pytest.mark.aws +def test_aws_custom_image(): + """Test AWS custom image""" + name = get_cluster_name() + test = Test( + 'test-aws-custom-image', + [ + f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image + f'sky logs {name} 1 --status', + ], + f'sky down -y {name}', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.kubernetes +@pytest.mark.parametrize( + 'image_id', + [ + 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', + 'docker:ubuntu:18.04', + # Test latest image with python 3.11 installed by default. + 'docker:continuumio/miniconda3:24.1.2-0', + # Test python>=3.12 where SkyPilot should automatically create a separate + # conda env for runtime with python 3.10. 
+ 'docker:continuumio/miniconda3:latest', + ]) +def test_kubernetes_custom_image(image_id): + """Test Kubernetes custom image""" + name = get_cluster_name() + test = Test( + 'test-kubernetes-custom-image', + [ + f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', + f'sky logs {name} 1 --status', + # Try exec to run again and check if the logs are printed + f'sky exec {name} tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1 | grep "Hello 100"', + # Make sure ssh is working with custom username + f'ssh {name} echo hi | grep hi', + ], + f'sky down -y {name}', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_start_stop_two_nodes(): + name = get_cluster_name() + test = Test( + 'azure-start-stop-two-nodes', + [ + f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', + f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky stop -y {name}', + f'sky start -y {name} -i 1', + f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', + f'sky logs {name} 2 --status', # Ensure the job succeeded. 
+ WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + timeout=200 + BUMP_UP_SECONDS) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' + ], + f'sky down -y {name}', + timeout=30 * 60, # 30 mins (it takes around ~23 mins) + ) + run_one_test(test) + + +# ---------- Testing env for disk tier ---------- +@pytest.mark.aws +def test_aws_disk_tier(): + + def _get_aws_query_command(region, instance_id, field, expected): + return (f'aws ec2 describe-volumes --region {region} ' + f'--filters Name=attachment.instance-id,Values={instance_id} ' + f'--query Volumes[*].{field} | grep {expected} ; ') + + for disk_tier in list(resources_utils.DiskTier): + specs = AWS._get_disk_specs(disk_tier) + name = get_cluster_name() + '-' + disk_tier.value + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.AWS.max_cluster_name_length()) + region = 'us-east-2' + test = Test( + 'aws-disk-tier-' + disk_tier.value, + [ + f'sky launch -y -c {name} --cloud aws --region {region} ' + f'--disk-tier {disk_tier.value} echo "hello sky"', + f'id=`aws ec2 describe-instances --region {region} --filters ' + f'Name=tag:ray-cluster-name,Values={name_on_cloud} --query ' + f'Reservations[].Instances[].InstanceId --output text`; ' + + _get_aws_query_command(region, '$id', 'VolumeType', + specs['disk_tier']) + + ('' if specs['disk_tier'] + == 'standard' else _get_aws_query_command( + region, '$id', 'Iops', specs['disk_iops'])) + + ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( + region, '$id', 'Throughput', specs['disk_throughput'])), + ], + f'sky down -y {name}', + timeout=10 * 60, # 10 mins (it takes around ~6 mins) + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_disk_tier(): + for disk_tier in list(resources_utils.DiskTier): + disk_types = [GCP._get_disk_type(disk_tier)] + name = get_cluster_name() + '-' + disk_tier.value + name_on_cloud = 
common_utils.make_cluster_name_on_cloud( + name, sky.GCP.max_cluster_name_length()) + region = 'us-west2' + instance_type_options = [''] + if disk_tier == resources_utils.DiskTier.BEST: + # Ultra disk tier requires n2 instance types to have more than 64 CPUs. + # If using default instance type, it will only enable the high disk tier. + disk_types = [ + GCP._get_disk_type(resources_utils.DiskTier.HIGH), + GCP._get_disk_type(resources_utils.DiskTier.ULTRA), + ] + instance_type_options = ['', '--instance-type n2-standard-64'] + for disk_type, instance_type_option in zip(disk_types, + instance_type_options): + test = Test( + 'gcp-disk-tier-' + disk_tier.value, + [ + f'sky launch -y -c {name} --cloud gcp --region {region} ' + f'--disk-tier {disk_tier.value} {instance_type_option} ', + f'name=`gcloud compute instances list --filter=' + f'"labels.ray-cluster-name:{name_on_cloud}" ' + '--format="value(name)"`; ' + f'gcloud compute disks list --filter="name=$name" ' + f'--format="value(type)" | grep {disk_type} ' + ], + f'sky down -y {name}', + timeout=6 * 60, # 6 mins (it takes around ~3 mins) + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_disk_tier(): + for disk_tier in list(resources_utils.DiskTier): + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + # Azure does not support high and ultra disk tier. 
+ continue + type = Azure._get_disk_type(disk_tier) + name = get_cluster_name() + '-' + disk_tier.value + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.Azure.max_cluster_name_length()) + region = 'westus2' + test = Test( + 'azure-disk-tier-' + disk_tier.value, + [ + f'sky launch -y -c {name} --cloud azure --region {region} ' + f'--disk-tier {disk_tier.value} echo "hello sky"', + f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' + f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' + f'--output tsv | grep {type}' + ], + f'sky down -y {name}', + timeout=20 * 60, # 20 mins (it takes around ~12 mins) + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_best_tier_failover(): + type = Azure._get_disk_type(resources_utils.DiskTier.LOW) + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.Azure.max_cluster_name_length()) + region = 'westus2' + test = Test( + 'azure-best-tier-failover', + [ + f'sky launch -y -c {name} --cloud azure --region {region} ' + f'--disk-tier best --instance-type Standard_D8_v5 echo "hello sky"', + f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' + f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' + f'--output tsv | grep {type}', + ], + f'sky down -y {name}', + timeout=20 * 60, # 20 mins (it takes around ~12 mins) + ) + run_one_test(test) + + +# ------ Testing Zero Quota Failover ------ +@pytest.mark.aws +def test_aws_zero_quota_failover(): + + name = get_cluster_name() + region = get_aws_region_for_quota_failover() + + if not region: + pytest.xfail( + 'Unable to test zero quota failover optimization — quotas ' + 'for EC2 P3 instances were found on all AWS regions. 
Is this ' + 'expected for your account?') + return + + test = Test( + 'aws-zero-quota-failover', + [ + f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_zero_quota_failover(): + + name = get_cluster_name() + region = get_gcp_region_for_quota_failover() + + if not region: + pytest.xfail( + 'Unable to test zero quota failover optimization — quotas ' + 'for A100-80GB GPUs were found on all GCP regions. Is this ' + 'expected for your account?') + return + + test = Test( + 'gcp-zero-quota-failover', + [ + f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +def test_long_setup_run_script(generic_cloud: str): + name = get_cluster_name() + with tempfile.NamedTemporaryFile('w', prefix='sky_app_', + suffix='.yaml') as f: + f.write( + textwrap.dedent(""" \ + setup: | + echo "start long setup" + """)) + for i in range(1024 * 200): + f.write(f' echo {i}\n') + f.write(' echo "end long setup"\n') + f.write( + textwrap.dedent(""" \ + run: | + echo "run" + """)) + for i in range(1024 * 200): + f.write(f' echo {i}\n') + f.write(' echo "end run"\n') + f.flush() + + test = Test( + 'long-setup-run-script', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', + f'sky exec {name} "echo hello"', + f'sky exec {name} {f.name}', + f'sky logs {name} --status 1', + f'sky logs {name} --status 2', + f'sky logs {name} --status 3', + ], + f'sky down -y {name}', + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index 96ce2f59c0c..e2e4c440b89 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -1,34 +1,28 @@ -# Smoke tests for SkyPilot +# Smoke tests for SkyPilot for image functionality # Default options are set in 
pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py +# > pytest tests/smoke_tests/test_images.py # # Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure +# > pytest tests/smoke_tests/test_images.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve +# > pytest tests/smoke_tests/test_images.py::test_aws_images # # Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws +# > pytest tests/smoke_tests/test_images.py --aws # # Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws +# > pytest tests/smoke_tests/test_images.py --generic-cloud aws import pytest -from smoke_tests.util import _get_cluster_name from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import get_cluster_name from smoke_tests.util import run_one_test from smoke_tests.util import Test @@ -38,7 +32,7 @@ # ---------- Test the image ---------- @pytest.mark.aws def test_aws_images(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_images', [ @@ -58,7 +52,7 @@ def test_aws_images(): @pytest.mark.gcp def test_gcp_images(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_images', [ @@ -78,7 +72,7 @@ def test_gcp_images(): @pytest.mark.azure def test_azure_images(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'azure_images', [ @@ -98,7 +92,7 @@ def test_azure_images(): @pytest.mark.aws def test_aws_image_id_dict(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_image_id_dict', [ @@ -117,7 +111,7 @@ 
def test_aws_image_id_dict(): @pytest.mark.gcp def test_gcp_image_id_dict(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_image_id_dict', [ @@ -136,7 +130,7 @@ def test_gcp_image_id_dict(): @pytest.mark.aws def test_aws_image_id_dict_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_image_id_dict_region', [ @@ -173,7 +167,7 @@ def test_aws_image_id_dict_region(): @pytest.mark.gcp def test_gcp_image_id_dict_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_image_id_dict_region', [ @@ -206,7 +200,7 @@ def test_gcp_image_id_dict_region(): @pytest.mark.aws def test_aws_image_id_dict_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_image_id_dict_zone', [ @@ -244,7 +238,7 @@ def test_aws_image_id_dict_zone(): @pytest.mark.gcp def test_gcp_image_id_dict_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_image_id_dict_zone', [ @@ -278,7 +272,7 @@ def test_gcp_image_id_dict_zone(): @pytest.mark.aws def test_clone_disk_aws(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'clone_disk_aws', [ @@ -305,7 +299,7 @@ def test_clone_disk_aws(): @pytest.mark.gcp def test_clone_disk_gcp(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'clone_disk_gcp', [ @@ -324,7 +318,7 @@ def test_clone_disk_gcp(): @pytest.mark.gcp def test_gcp_mig(): - name = _get_cluster_name() + name = get_cluster_name() region = 'us-central1' test = Test( 'gcp_mig', @@ -354,7 +348,7 @@ def test_gcp_mig(): @pytest.mark.gcp def test_gcp_force_enable_external_ips(): - name = _get_cluster_name() + name = get_cluster_name() test_commands = [ f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', # Check network of vm is "default" @@ -376,7 +370,7 @@ def test_gcp_force_enable_external_ips(): @pytest.mark.aws def test_image_no_conda(): - name = _get_cluster_name() + name = 
get_cluster_name() test = Test( 'image_no_conda', [ @@ -396,7 +390,7 @@ def test_image_no_conda(): @pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_custom_default_conda_env(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() test = Test('custom_default_conda_env', [ f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky status -r {name} | grep "UP"', diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py new file mode 100644 index 00000000000..521b08797f5 --- /dev/null +++ b/tests/smoke_tests/test_managed_job.py @@ -0,0 +1,766 @@ +# Smoke tests for SkyPilot for managed jobs +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_managed_job.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_managed_job.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs +# +# Only run managed job tests +# > pytest tests/smoke_tests/test_managed_job.py --managed-jobs +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_managed_job.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_managed_job.py --generic-cloud aws + +import pathlib +import tempfile +import time + +import pytest +from smoke_tests.util import _BUMP_UP_SECONDS +from smoke_tests.util import get_cluster_name +from smoke_tests.util import GET_JOB_QUEUE +from smoke_tests.util import JOB_WAIT_NOT_RUNNING +from smoke_tests.util import run_one_test +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test +from smoke_tests.util 
import TestStorageWithCredentials +from smoke_tests.util import ( + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) + +from sky import jobs +from sky.data import storage as storage_lib +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.utils import common_utils + + +# ---------- Testing managed job ---------- +# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues +# when the controller being on Azure, which takes a long time for launching +# step. +@pytest.mark.managed_jobs +def test_managed_jobs(generic_cloud: str): + """Test the managed jobs yaml.""" + name = get_cluster_name() + test = Test( + 'managed-jobs', + [ + f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', + f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status= + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60), + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status= + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60), + f'sky jobs cancel -y -n {name}-1', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status=f'{ManagedJobStatus.CANCELLED.value}', + timeout=230), + # Test the functionality for logging. + f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', + f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', + f'{GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', + ], + # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when + # canceling multiple job names is supported. 
+ f'sky jobs cancel -y -n {name}-1; sky jobs cancel -y -n {name}-2', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack #fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_job_pipeline(generic_cloud: str): + """Test a job pipeline.""" + name = get_cluster_name() + test = Test( + 'spot-pipeline', + [ + f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', + 'sleep 5', + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + # `grep -A 4 {name}` finds the job with {name} and the 4 lines + # after it, i.e. the 4 tasks within the job. + # `sed -n 2p` gets the second line of the 4 lines, i.e. the first + # task within the job. 
+ f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', + f'sky jobs cancel -y -n {name}', + 'sleep 5', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', + 'sleep 200', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + ], + f'sky jobs cancel -y -n {name}', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack #fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_managed_jobs_failed_setup(generic_cloud: str): + """Test managed job with failed setup.""" + name = get_cluster_name() + test = Test( + 'managed_jobs_failed_setup', + [ + f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', + # Make sure the job failed quickly. 
+ WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + timeout=330 + _BUMP_UP_SECONDS), + ], + f'sky jobs cancel -y -n {name}', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack #fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): + """Test managed job with failed setup for a pipeline.""" + name = get_cluster_name() + test = Test( + 'managed_jobs_pipeline_failed_setup', + [ + f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + timeout=600), + # Make sure the job failed quickly. + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + # Task 0 should be SUCCEEDED. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', + # Task 1 should be FAILED_SETUP. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', + # Task 2 should be CANCELLED. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + # Task 3 should be CANCELLED. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + ], + f'sky jobs cancel -y -n {name}', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
+ timeout=30 * 60, + ) + run_one_test(test) + + +# ---------- Testing managed job recovery ---------- + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_aws(aws_config_region): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + region = aws_config_region + test = Test( + 'managed_jobs_recovery_aws', + [ + f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=600), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the cluster manually. + (f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_gcp(): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + zone = 'us-east4-b' + query_cmd = ( + f'gcloud compute 
instances list --filter=' + # `:` means prefix match. + f'"(labels.ray-cluster-name:{name_on_cloud})" ' + f'--zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_recovery_gcp', + [ + f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=300), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the cluster manually. + terminate_cmd, + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_pipeline_recovery_aws(aws_config_region): + """Test managed job recovery for a pipeline.""" + name = get_cluster_name() + user_hash = common_utils.get_user_hash() + user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] + region = aws_config_region + if region != 'us-east-2': + pytest.skip('Only run spot pipeline recovery test in us-east-2') + test = Test( + 'managed_jobs_pipeline_recovery_aws', + [ + f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | 
grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', + # Terminate the cluster manually. + # The `cat ...| rev` is to retrieve the job_id from the + # SKYPILOT_TASK_ID, which gets the second to last field + # separated by `-`. + ( + f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' + 'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;' + f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + # TODO(zhwu): fix the name for spot cluster. + '--filters Name=tag:ray-cluster-name,Values=*-${MANAGED_JOB_ID}' + f'-{user_hash} ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', + f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', + f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_pipeline_recovery_gcp(): + """Test managed job recovery for a pipeline.""" + name = get_cluster_name() + zone = 'us-east4-b' + user_hash = common_utils.get_user_hash() + user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] + query_cmd = ( + 'gcloud compute instances list --filter=' + 
f'"(labels.ray-cluster-name:*-${{MANAGED_JOB_ID}}-{user_hash})" ' + f'--zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_pipeline_recovery_gcp', + [ + f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', + # Terminate the cluster manually. + # The `cat ...| rev` is to retrieve the job_id from the + # SKYPILOT_TASK_ID, which gets the second to last field + # separated by `-`. + (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' + f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', + f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', + f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not 
support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_default_resources(generic_cloud: str): + """Test managed job recovery for default resources.""" + name = get_cluster_name() + test = Test( + 'managed-spot-recovery-default-resources', + [ + f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + timeout=360), + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_multi_node_aws(aws_config_region): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + region = aws_config_region + test = Test( + 'managed_jobs_recovery_multi_node_aws', + [ + f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=450), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the worker manually. 
+ (f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' + 'Name=tag:ray-node-type,Values=worker ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_multi_node_gcp(): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + zone = 'us-west2-a' + # Use ':' to match as the cluster name will contain the suffix with job id + query_cmd = ( + f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{name_on_cloud} AND ' + f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_recovery_multi_node_gcp', + [ + f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the worker manually. 
+ terminate_cmd, + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_cancellation_aws(aws_config_region): + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + name_2_on_cloud = common_utils.make_cluster_name_on_cloud( + f'{name}-2', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + name_3_on_cloud = common_utils.make_cluster_name_on_cloud( + f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + region = aws_config_region + test = Test( + 'managed_jobs_cancellation_aws', + [ + # Test cancellation during spot cluster being launched. 
+ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + (f's=$(aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' + f'--query Reservations[].Instances[].State[].Name ' + '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' + ), + # Test cancelling the spot cluster during spot job being setup. + f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', + # The job is set up in the cluster, will shown as RUNNING. + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}-2', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + (f's=$(aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' + f'--query Reservations[].Instances[].State[].Name ' + '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' + ), + # Test cancellation during spot job is recovering. + f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', + # The job is running in the cluster, will shown as RUNNING. 
+ WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + # Terminate the cluster manually. + (f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + f'sky jobs cancel -y -n {name}-3', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because + # there can be multiple VM with the same name due to the recovery. + (f's=$(aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' + f'--query Reservations[].Instances[].State[].Name ' + '--output text) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "pending|running|stopped|stopping"' + ), + ], + timeout=25 * 60) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_cancellation_gcp(): + name = get_cluster_name() + name_3 = f'{name}-3' + name_3_on_cloud = common_utils.make_cluster_name_on_cloud( + name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + zone = 'us-west3-b' + query_state_cmd = ( + 'gcloud compute instances list ' + f'--filter="(labels.ray-cluster-name:{name_3_on_cloud})" ' + '--format="value(status)"') + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{name_3_on_cloud})" ' + f'--zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + 
f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_cancellation_gcp', + [ + # Test cancellation during spot cluster being launched. + f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # Test cancelling the spot cluster during spot job being setup. + f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', + # The job is set up in the cluster, will shown as RUNNING. + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}-2', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # Test cancellation during spot job is recovering. + f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + # Terminate the cluster manually. + terminate_cmd, + JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + f'sky jobs cancel -y -n {name}-3', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # The cluster should be terminated (STOPPING) after cancellation. 
# ---------- Testing storage for managed job ----------
@pytest.mark.no_fluidstack  # Fluidstack does not support spot instances
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm  # IBM Cloud does not support spot instances
@pytest.mark.no_paperspace  # Paperspace does not support spot instances
@pytest.mark.no_scp  # SCP does not support spot instances
@pytest.mark.managed_jobs
def test_managed_jobs_storage(generic_cloud: str):
    """Test storage with managed job.

    Launches a managed job whose YAML mounts an input bucket and writes to an
    output bucket, then checks the job succeeds, the input bucket is cleaned
    up, and the output file landed in the output bucket.
    """
    name = get_cluster_name()
    yaml_str = pathlib.Path(
        'examples/managed_job_with_storage.yaml').read_text()
    timestamp = int(time.time())
    storage_name = f'sky-test-{timestamp}'
    output_storage_name = f'sky-test-output-{timestamp}'

    # Also perform region testing for bucket creation to validate if buckets
    # are created in the correct region and correctly mounted in managed jobs.
    # However, we inject this testing only for AWS and GCP since they are the
    # supported object storage providers in SkyPilot.
    region_flag = ''
    region_validation_cmd = 'true'
    use_spot = ' --use-spot'
    if generic_cloud == 'aws':
        region = 'eu-central-1'
        region_flag = f' --region {region}'
        region_cmd = TestStorageWithCredentials.cli_region_cmd(
            storage_lib.StoreType.S3, bucket_name=storage_name)
        region_validation_cmd = f'{region_cmd} | grep {region}'
        s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.S3, output_storage_name, 'output.txt')
        output_check_cmd = f'{s3_check_file_count} | grep 1'
    elif generic_cloud == 'gcp':
        region = 'us-west2'
        region_flag = f' --region {region}'
        region_cmd = TestStorageWithCredentials.cli_region_cmd(
            storage_lib.StoreType.GCS, bucket_name=storage_name)
        region_validation_cmd = f'{region_cmd} | grep {region}'
        gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.GCS, output_storage_name, 'output.txt')
        output_check_cmd = f'{gcs_check_file_count} | grep 1'
    elif generic_cloud == 'azure':
        region = 'westus2'
        region_flag = f' --region {region}'
        # Azure bucket region is determined by its storage account's region.
        storage_account_name = (
            storage_lib.AzureBlobStore.get_default_storage_account_name(region))
        region_cmd = TestStorageWithCredentials.cli_region_cmd(
            storage_lib.StoreType.AZURE,
            storage_account_name=storage_account_name)
        region_validation_cmd = f'{region_cmd} | grep {region}'
        az_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.AZURE,
            output_storage_name,
            'output.txt',
            storage_account_name=storage_account_name)
        output_check_cmd = f'{az_check_file_count} | grep 1'
    elif generic_cloud == 'kubernetes':
        # With Kubernetes, we don't know which object storage provider is
        # used. Check both S3 and GCS if bucket exists in either.
        s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.S3, output_storage_name, 'output.txt')
        s3_output_check_cmd = f'{s3_check_file_count} | grep 1'
        gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.GCS, output_storage_name, 'output.txt')
        gcs_output_check_cmd = f'{gcs_check_file_count} | grep 1'
        output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}'
        use_spot = ' --no-use-spot'

    yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name)
    yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(yaml_str)
        f.flush()
        file_path = f.name
        test = Test(
            'managed_jobs_storage',
            [
                *STORAGE_SETUP_COMMANDS,
                f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y',
                region_validation_cmd,  # Check if the bucket is created in the correct region
                WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                    job_name=name,
                    job_status=ManagedJobStatus.SUCCEEDED.value,
                    timeout=60 + _BUMP_UP_SECONDS),
                # The input bucket (non-persistent) must be cleaned up.
                f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]',
                # Check if file was written to the mounted output bucket
                output_check_cmd
            ],
            # FIX: was a 2-tuple (stray comma); teardown must be a single
            # shell string, so concatenate the two parts.
            (f'sky jobs cancel -y -n {name}'
             f'; sky storage delete {output_storage_name} || true'),
            # Increase timeout since sky jobs queue -r can be blocked by other
            # spot tests.
            timeout=20 * 60,
        )
        run_one_test(test)


# ---------- Testing spot TPU ----------
@pytest.mark.gcp
@pytest.mark.managed_jobs
@pytest.mark.tpu
def test_managed_jobs_tpu():
    """Test managed job on TPU."""
    name = get_cluster_name()
    test = Test(
        'test-spot-tpu',
        [
            f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d',
            WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                job_name=name,
                job_status=ManagedJobStatus.STARTING.value,
                timeout=60 + _BUMP_UP_SECONDS),
            # TPU takes a while to launch
            WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                job_name=name,
                job_status=
                f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})',
                timeout=900 + _BUMP_UP_SECONDS),
        ],
        f'sky jobs cancel -y -n {name}',
        # Increase timeout since sky jobs queue -r can be blocked by other
        # spot tests.
        timeout=20 * 60,
    )
    run_one_test(test)


# ---------- Testing env for managed jobs ----------
@pytest.mark.managed_jobs
def test_managed_jobs_inline_env(generic_cloud: str):
    """Test managed jobs env: --env values and SKYPILOT_* vars are visible."""
    name = get_cluster_name()
    test = Test(
        'test-managed-jobs-inline-env',
        [
            f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
            WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                job_name=name,
                job_status=ManagedJobStatus.SUCCEEDED.value,
                timeout=20 + _BUMP_UP_SECONDS),
        ],
        f'sky jobs cancel -y -n {name}',
        # Increase timeout since sky jobs queue -r can be blocked by other
        # spot tests.
        timeout=20 * 60,
    )
    run_one_test(test)
+ timeout=20 * 60, + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py new file mode 100644 index 00000000000..95952d3b432 --- /dev/null +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -0,0 +1,1503 @@ +# Smoke tests for SkyPilot for mounting storage +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_mount_and_storage.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_mount_and_storage.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_mount_and_storage.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_mount_and_storage.py --generic-cloud aws + +import os +import pathlib +import shlex +import shutil +import subprocess +import tempfile +import time +from typing import Dict, Optional +import urllib.parse +import uuid + +import jinja2 +import pytest +from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_timeout +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test +from smoke_tests.util import TestStorageWithCredentials + +import sky +from sky import global_user_state +from sky import skypilot_config +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone + + +# ---------- file_mounts ---------- +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. 
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead.
def test_file_mounts(generic_cloud: str):
    """Launch a cluster with file mounts and verify the job succeeds."""
    name = get_cluster_name()
    extra_flags = ''
    # FIX: was `generic_cloud in 'kubernetes'`, i.e. substring membership on a
    # string (true for 'k', 'kube', ...). Use equality.
    if generic_cloud == 'kubernetes':
        # Kubernetes does not support multi-node
        # NOTE: This test will fail if you have a Kubernetes cluster running on
        # arm64 (e.g., Apple Silicon) since goofys does not work on arm64.
        extra_flags = '--num-nodes 1'
    test_commands = [
        *STORAGE_SETUP_COMMANDS,
        f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml',
        f'sky logs {name} 1 --status',  # Ensure the job succeeded.
    ]
    test = Test(
        'using_file_mounts',
        test_commands,
        f'sky down -y {name}',
        get_timeout(generic_cloud, 20 * 60),  # 20 mins
    )
    run_one_test(test)


@pytest.mark.scp
def test_scp_file_mounts():
    """Single-node file-mounts test on SCP."""
    name = get_cluster_name()
    test_commands = [
        *STORAGE_SETUP_COMMANDS,
        f'sky launch -y -c {name} {SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml',
        f'sky logs {name} 1 --status',  # Ensure the job succeeded.
    ]
    test = Test(
        'SCP_using_file_mounts',
        test_commands,
        f'sky down -y {name}',
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # Requires GCP to be enabled
def test_using_file_mounts_with_env_vars(generic_cloud: str):
    """File mounts whose paths come from --env variables."""
    name = get_cluster_name()
    storage_name = TestStorageWithCredentials.generate_bucket_name()
    test_commands = [
        *STORAGE_SETUP_COMMANDS,
        (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} '
         'examples/using_file_mounts_with_env_vars.yaml '
         f'--env MY_BUCKET={storage_name}'),
        f'sky logs {name} 1 --status',  # Ensure the job succeeded.
        # Override with --env:
        (f'sky launch -y -c {name}-2 --cpus 2+ --cloud {generic_cloud} '
         'examples/using_file_mounts_with_env_vars.yaml '
         f'--env MY_BUCKET={storage_name} '
         '--env MY_LOCAL_PATH=tmpfile'),
        f'sky logs {name}-2 1 --status',  # Ensure the job succeeded.
    ]
    test = Test(
        'using_file_mounts_with_env_vars',
        test_commands,
        # FIX: was a 2-tuple (stray comma); teardown must be one shell string.
        (f'sky down -y {name} {name}-2; '
         f'sky storage delete -y {storage_name} {storage_name}-2'),
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


# ---------- storage ----------
@pytest.mark.aws
def test_aws_storage_mounts_with_stop():
    """Mount an S3 bucket, stop/start the cluster, and verify the mount survives."""
    name = get_cluster_name()
    cloud = 'aws'
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name, cloud=cloud)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'aws s3 ls {storage_name}/hello.txt',
            f'sky stop -y {name}',
            f'sky start -y {name}',
            # Check if hello.txt from mounting bucket exists after restart in
            # the mounted directory
            f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"'
        ]
        test = Test(
            'aws_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.gcp
def test_gcp_storage_mounts_with_stop():
    """Mount a GCS bucket, stop/start the cluster, and verify the mount survives."""
    name = get_cluster_name()
    cloud = 'gcp'
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name, cloud=cloud)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'gsutil ls gs://{storage_name}/hello.txt',
            f'sky stop -y {name}',
            f'sky start -y {name}',
            # Check if hello.txt from mounting bucket exists after restart in
            # the mounted directory
            f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"'
        ]
        test = Test(
            'gcp_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)
@pytest.mark.azure
def test_azure_storage_mounts_with_stop():
    """Mount an Azure container, stop/start, and verify the mount survives."""
    name = get_cluster_name()
    cloud = 'azure'
    storage_name = f'sky-test-{int(time.time())}'
    default_region = 'eastus'
    storage_account_name = (storage_lib.AzureBlobStore.
                            get_default_storage_account_name(default_region))
    storage_account_key = data_utils.get_az_storage_account_key(
        storage_account_name)
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name, cloud=cloud)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            # FIX: the command-substitution assignment was implicitly
            # concatenated with the check with no '; ' separator (broken
            # bash), and '[ ... ] && exit 1' returned status 1 on the
            # success path. Terminate the assignment and neutralize the
            # status with '|| true'.
            f'output=$(az storage blob list -c {storage_name} --account-name {storage_account_name} --account-key {storage_account_key} --prefix hello.txt); '
            # if the file does not exist, az storage blob list returns '[]'
            f'[ "$output" = "[]" ] && exit 1 || true',
            f'sky stop -y {name}',
            f'sky start -y {name}',
            # Check if hello.txt from mounting bucket exists after restart in
            # the mounted directory
            f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"'
        ]
        test = Test(
            'azure_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.kubernetes
def test_kubernetes_storage_mounts():
    """Bucket mounting on k8s; the backing store may be S3 or GCS."""
    # Tests bucket mounting on k8s, assuming S3 is configured.
    # This test will fail if run on non x86_64 architecture, since goofys is
    # built for x86_64 only.
    name = get_cluster_name()
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud kubernetes {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            # Either provider may have been picked; accept either bucket.
            f'aws s3 ls {storage_name}/hello.txt || '
            f'gsutil ls gs://{storage_name}/hello.txt',
        ]
        test = Test(
            'kubernetes_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)
@pytest.mark.kubernetes
def test_kubernetes_context_switch():
    """Verify a launched cluster stays reachable after kubectl context switch."""
    name = get_cluster_name()
    new_context = f'sky-test-context-{int(time.time())}'
    new_namespace = f'sky-test-namespace-{int(time.time())}'

    test_commands = [
        # Launch a cluster and run a simple task
        f'sky launch -y -c {name} --cloud kubernetes "echo Hello from original context"',
        f'sky logs {name} 1 --status',  # Ensure job succeeded

        # Get current context details and save to a file for later use in
        # cleanup. NOTE: the next four strings plus the set-context command
        # are deliberately one concatenated shell command so the CURRENT_*
        # variables stay in scope for kubectl config set-context.
        'CURRENT_CONTEXT=$(kubectl config current-context); '
        'echo "$CURRENT_CONTEXT" > /tmp/sky_test_current_context; '
        'CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.cluster}"); '
        'CURRENT_USER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.user}"); '

        # Create a new context with a different name and namespace
        f'kubectl config set-context {new_context} --cluster="$CURRENT_CLUSTER" --user="$CURRENT_USER" --namespace={new_namespace}',

        # Create the new namespace if it doesn't exist
        f'kubectl create namespace {new_namespace} --dry-run=client -o yaml | kubectl apply -f -',

        # Set the new context as active
        f'kubectl config use-context {new_context}',

        # Verify the new context is active
        f'[ "$(kubectl config current-context)" = "{new_context}" ] || exit 1',

        # Try to run sky exec on the original cluster (should still work)
        f'sky exec {name} "echo Success: sky exec works after context switch"',

        # Test sky queue
        f'sky queue {name}',

        # Test SSH access
        f'ssh {name} whoami',
    ]

    cleanup_commands = (f'kubectl delete namespace {new_namespace}; '
                        f'kubectl config delete-context {new_context}; '
                        'kubectl config use-context $(cat /tmp/sky_test_current_context); '
                        'rm /tmp/sky_test_current_context; '
                        f'sky down -y {name}')

    test = Test(
        'kubernetes_context_switch',
        test_commands,
        cleanup_commands,
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


@pytest.mark.parametrize(
    'image_id',
    [
        'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04',
        'docker:ubuntu:18.04',
        # Test image with python 3.11 installed by default.
        'docker:continuumio/miniconda3:24.1.2-0',
        # Test python>=3.12 where SkyPilot should automatically create a
        # separate conda env for runtime with python 3.10.
        'docker:continuumio/miniconda3:latest',
    ])
def test_docker_storage_mounts(generic_cloud: str, image_id: str):
    """Bucket mounting inside a docker container image."""
    name = get_cluster_name()
    timestamp = str(time.time()).replace('.', '')
    storage_name = f'sky-test-{timestamp}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    # ubuntu 18.04 does not support fuse3, and blobfuse2 depends on fuse3.
    azure_mount_unsupported_ubuntu_version = '18.04'
    # Commands to verify bucket upload. We need to check all three
    # storage types because the optimizer may pick any of them.
    s3_command = f'aws s3 ls {storage_name}/hello.txt'
    gsutil_command = f'gsutil ls gs://{storage_name}/hello.txt'
    azure_blob_command = TestStorageWithCredentials.cli_ls_cmd(
        storage_lib.StoreType.AZURE, storage_name, suffix='hello.txt')
    if azure_mount_unsupported_ubuntu_version in image_id:
        # The store for mount_private_mount is not specified in the template.
        # If we're running on Azure, the private mount will be created on
        # azure blob. That will not be supported on the ubuntu 18.04 image
        # and thus fail. For other clouds, the private mount on other
        # storage types (GCS/S3) should succeed.
        # FIX: idiom — was `False if generic_cloud == 'azure' else True`.
        include_private_mount = generic_cloud != 'azure'
        content = template.render(storage_name=storage_name,
                                  include_azure_mount=False,
                                  include_private_mount=include_private_mount)
    else:
        content = template.render(storage_name=storage_name,)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            # Check AWS, GCP, or Azure storage mount.
            f'{s3_command} || '
            f'{gsutil_command} || '
            f'{azure_blob_command}',
        ]
        test = Test(
            'docker_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.cloudflare
def test_cloudflare_storage_mounts(generic_cloud: str):
    """R2 bucket mounting via the aws CLI with the r2 profile."""
    name = get_cluster_name()
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_r2_storage_mounting.yaml').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name)
    endpoint_url = cloudflare.create_endpoint()
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2'
        ]

        test = Test(
            'cloudflare_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.ibm
def test_ibm_storage_mounts():
    """IBM COS bucket mounting, verified through the rclone profile."""
    name = get_cluster_name()
    storage_name = f'sky-test-{int(time.time())}'
    bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name(
        storage_name, Rclone.RcloneClouds.IBM)
    template_str = pathlib.Path(
        'tests/test_yamls/test_ibm_cos_storage_mounting.yaml').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud ibm {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt',
        ]
        test = Test(
            'ibm_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)
# ---------- Testing Storage ----------
class TestStorageWithCredentials:
    """Storage tests which require credentials and network connection"""

    # Bucket names rejected by AWS S3 naming rules.
    AWS_INVALID_NAMES = [
        'ab',  # less than 3 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # more than 63 characters
        'Abcdef',  # contains an uppercase letter
        'abc def',  # contains a space
        'abc..def',  # two adjacent periods
        '192.168.5.4',  # formatted as an IP address
        'xn--bucket',  # starts with 'xn--' prefix
        'bucket-s3alias',  # ends with '-s3alias' suffix
        'bucket--ol-s3',  # ends with '--ol-s3' suffix
        '.abc',  # starts with a dot
        'abc.',  # ends with a dot
        '-abc',  # starts with a hyphen
        'abc-',  # ends with a hyphen
    ]

    # Bucket names rejected by GCS naming rules.
    GCS_INVALID_NAMES = [
        'ab',  # less than 3 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # more than 63 characters (without dots)
        'Abcdef',  # contains an uppercase letter
        'abc def',  # contains a space
        'abc..def',  # two adjacent periods
        # FIX: a missing comma after the next entry silently concatenated it
        # into the `* 5` literal below (implicit string concatenation binds
        # before `*`), collapsing two intended test cases into one bogus one.
        'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # More than 63 characters between dots
        'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqfghijklmnopqrstuvw' * 5,
        # more than 222 characters (with dots)
        '192.168.5.4',  # formatted as an IP address
        'googbucket',  # starts with 'goog' prefix
        'googlebucket',  # contains 'google'
        'g00glebucket',  # variant of 'google'
        'go0glebucket',  # variant of 'google'
        'g0oglebucket',  # variant of 'google'
        '.abc',  # starts with a dot
        'abc.',  # ends with a dot
        '_abc',  # starts with an underscore
        'abc_',  # ends with an underscore
    ]

    # Container names rejected by Azure Blob naming rules.
    AZURE_INVALID_NAMES = [
        'ab',  # less than 3 characters
        # more than 63 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        'Abcdef',  # contains an uppercase letter
        '.abc',  # starts with a non-letter(dot)
        'a--bc',  # contains consecutive hyphens
    ]

    # Bucket names rejected by IBM COS naming rules.
    IBM_INVALID_NAMES = [
        'ab',  # less than 3 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # more than 63 characters
        'Abcdef',  # contains an uppercase letter
        'abc def',  # contains a space
        'abc..def',  # two adjacent periods
        '192.168.5.4',  # formatted as an IP address
        'xn--bucket',  # starts with 'xn--' prefix
        '.abc',  # starts with a dot
        'abc.',  # ends with a dot
        '-abc',  # starts with a hyphen
        'abc-',  # ends with a hyphen
        'a.-bc',  # contains the sequence '.-'
        'a-.bc',  # contains the sequence '-.'
        # FIX: missing commas fused the next two entries into 'a&bcab^c'.
        'a&bc',  # contains special characters
        'ab^c',  # contains special characters
    ]

    # Directory layout exercised by the .gitignore/.git-info-exclude sync
    # tests; None means "file", a dict means "subdirectory".
    GITIGNORE_SYNC_TEST_DIR_STRUCTURE = {
        'double_asterisk': {
            'double_asterisk_excluded': None,
            'double_asterisk_excluded_dir': {
                'dir_excluded': None,
            },
        },
        'double_asterisk_parent': {
            'parent': {
                'also_excluded.txt': None,
                'child': {
                    'double_asterisk_parent_child_excluded.txt': None,
                },
                'double_asterisk_parent_excluded.txt': None,
            },
        },
        'excluded.log': None,
        'excluded_dir': {
            'excluded.txt': None,
            'nested_excluded': {
                'excluded': None,
            },
        },
        'exp-1': {
            'be_excluded': None,
        },
        'exp-2': {
            'be_excluded': None,
        },
        'front_slash_excluded': None,
        'included.log': None,
        'included.txt': None,
        'include_dir': {
            'excluded.log': None,
            'included.log': None,
        },
        'nested_double_asterisk': {
            'one': {
                'also_exclude.txt': None,
            },
            'two': {
                'also_exclude.txt': None,
            },
        },
        'nested_wildcard_dir': {
            'monday': {
                'also_exclude.txt': None,
            },
            'tuesday': {
                'also_exclude.txt': None,
            },
        },
        'no_slash_excluded': None,
        'no_slash_tests': {
            'no_slash_excluded': {
                'also_excluded.txt': None,
            },
        },
        'question_mark': {
            'excluded1.txt': None,
            'excluded@.txt': None,
        },
        'square_bracket': {
            'excluded1.txt': None,
        },
        'square_bracket_alpha': {
            'excludedz.txt': None,
        },
        'square_bracket_excla': {
            'excluded2.txt': None,
            'excluded@.txt': None,
        },
        'square_bracket_single': {
            'excluded0.txt': None,
        },
    }
'excluded1.txt': None, + 'excluded@.txt': None, + }, + 'square_bracket': { + 'excluded1.txt': None, + }, + 'square_bracket_alpha': { + 'excludedz.txt': None, + }, + 'square_bracket_excla': { + 'excluded2.txt': None, + 'excluded@.txt': None, + }, + 'square_bracket_single': { + 'excluded0.txt': None, + }, + } + + @staticmethod + def create_dir_structure(base_path, structure): + # creates a given file STRUCTURE in BASE_PATH + for name, substructure in structure.items(): + path = os.path.join(base_path, name) + if substructure is None: + # Create a file + open(path, 'a', encoding='utf-8').close() + else: + # Create a subdirectory + os.mkdir(path) + TestStorageWithCredentials.create_dir_structure( + path, substructure) + + @staticmethod + def cli_delete_cmd(store_type, + bucket_name, + storage_account_name: str = None): + if store_type == storage_lib.StoreType.S3: + url = f's3://{bucket_name}' + return f'aws s3 rb {url} --force' + if store_type == storage_lib.StoreType.GCS: + url = f'gs://{bucket_name}' + gsutil_alias, alias_gen = data_utils.get_gsutil_command() + return f'{alias_gen}; {gsutil_alias} rm -r {url}' + if store_type == storage_lib.StoreType.AZURE: + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + return ('az storage container delete ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key} ' + f'--name {bucket_name}') + if store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + url = f's3://{bucket_name}' + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {url} --force --endpoint {endpoint_url} --profile=r2' + if store_type == storage_lib.StoreType.IBM: + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + bucket_name, Rclone.RcloneClouds.IBM) + return f'rclone purge 
{bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' + + @staticmethod + def cli_ls_cmd(store_type, bucket_name, suffix=''): + if store_type == storage_lib.StoreType.S3: + if suffix: + url = f's3://{bucket_name}/{suffix}' + else: + url = f's3://{bucket_name}' + return f'aws s3 ls {url}' + if store_type == storage_lib.StoreType.GCS: + if suffix: + url = f'gs://{bucket_name}/{suffix}' + else: + url = f'gs://{bucket_name}' + return f'gsutil ls {url}' + if store_type == storage_lib.StoreType.AZURE: + default_region = 'eastus' + config_storage_account = skypilot_config.get_nested( + ('azure', 'storage_account'), None) + storage_account_name = config_storage_account if ( + config_storage_account is not None) else ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + list_cmd = ('az storage blob list ' + f'--container-name {bucket_name} ' + f'--prefix {shlex.quote(suffix)} ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key}') + return list_cmd + if store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + if suffix: + url = f's3://{bucket_name}/{suffix}' + else: + url = f's3://{bucket_name}' + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' + if store_type == storage_lib.StoreType.IBM: + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + bucket_name, Rclone.RcloneClouds.IBM) + return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' + + @staticmethod + def cli_region_cmd(store_type, bucket_name=None, storage_account_name=None): + if store_type == storage_lib.StoreType.S3: + assert bucket_name is not None + return ('aws s3api get-bucket-location ' + f'--bucket {bucket_name} --output text') + elif store_type == storage_lib.StoreType.GCS: + assert 
bucket_name is not None + return (f'gsutil ls -L -b gs://{bucket_name}/ | ' + 'grep "Location constraint" | ' + 'awk \'{print tolower($NF)}\'') + elif store_type == storage_lib.StoreType.AZURE: + # For Azure Blob Storage, the location of the containers are + # determined by the location of storage accounts. + assert storage_account_name is not None + return (f'az storage account show --name {storage_account_name} ' + '--query "primaryLocation" --output tsv') + else: + raise NotImplementedError(f'Region command not implemented for ' + f'{store_type}') + + @staticmethod + def cli_count_name_in_bucket(store_type, + bucket_name, + file_name, + suffix='', + storage_account_name=None): + if store_type == storage_lib.StoreType.S3: + if suffix: + return f'aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' + else: + return f'aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' + elif store_type == storage_lib.StoreType.GCS: + if suffix: + return f'gsutil ls -r gs://{bucket_name}/{suffix} | grep "{file_name}" | wc -l' + else: + return f'gsutil ls -r gs://{bucket_name} | grep "{file_name}" | wc -l' + elif store_type == storage_lib.StoreType.AZURE: + if storage_account_name is None: + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + return ('az storage blob list ' + f'--container-name {bucket_name} ' + f'--prefix {shlex.quote(suffix)} ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key} | ' + f'grep {file_name} | ' + 'wc -l') + elif store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + if suffix: + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket 
"{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' + else: + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' + + @staticmethod + def cli_count_file_in_bucket(store_type, bucket_name): + if store_type == storage_lib.StoreType.S3: + return f'aws s3 ls s3://{bucket_name} --recursive | wc -l' + elif store_type == storage_lib.StoreType.GCS: + return f'gsutil ls -r gs://{bucket_name}/** | wc -l' + elif store_type == storage_lib.StoreType.AZURE: + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + return ('az storage blob list ' + f'--container-name {bucket_name} ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key} | ' + 'grep \\"name\\": | ' + 'wc -l') + elif store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{bucket_name} --recursive --endpoint {endpoint_url} --profile=r2 | wc -l' + + @pytest.fixture + def tmp_source(self, tmp_path): + # Creates a temporary directory with a file in it + tmp_dir = tmp_path / 'tmp-source' + tmp_dir.mkdir() + tmp_file = tmp_dir / 'tmp-file' + tmp_file.write_text('test') + circle_link = tmp_dir / 'circle-link' + circle_link.symlink_to(tmp_dir, target_is_directory=True) + yield str(tmp_dir) + + @staticmethod + def generate_bucket_name(): + # Creates a temporary bucket name + # time.time() returns varying precision on different systems, so we + # replace the decimal point and use whatever precision we can get. 
+ timestamp = str(time.time()).replace('.', '') + return f'sky-test-{timestamp}' + + @pytest.fixture + def tmp_bucket_name(self): + yield self.generate_bucket_name() + + @staticmethod + def yield_storage_object( + name: Optional[str] = None, + source: Optional[storage_lib.Path] = None, + stores: Optional[Dict[storage_lib.StoreType, + storage_lib.AbstractStore]] = None, + persistent: Optional[bool] = True, + mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT): + # Creates a temporary storage object. Stores must be added in the test. + storage_obj = storage_lib.Storage(name=name, + source=source, + stores=stores, + persistent=persistent, + mode=mode) + yield storage_obj + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + # If handle exists, delete manually + # TODO(romilb): This is potentially risky - if the delete method has + # bugs, this can cause resource leaks. Ideally we should manually + # eject storage from global_user_state and delete the bucket using + # boto3 directly. + storage_obj.delete() + + @pytest.fixture + def tmp_scratch_storage_obj(self, tmp_bucket_name): + # Creates a storage object with no source to create a scratch storage. + # Stores must be added in the test. + yield from self.yield_storage_object(name=tmp_bucket_name) + + @pytest.fixture + def tmp_multiple_scratch_storage_obj(self): + # Creates a list of 5 storage objects with no source to create + # multiple scratch storages. + # Stores for each object in the list must be added in the test. 
+ storage_mult_obj = [] + for _ in range(5): + timestamp = str(time.time()).replace('.', '') + store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}') + storage_mult_obj.append(store_obj) + yield storage_mult_obj + for storage_obj in storage_mult_obj: + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + # If handle exists, delete manually + # TODO(romilb): This is potentially risky - if the delete method has + # bugs, this can cause resource leaks. Ideally we should manually + # eject storage from global_user_state and delete the bucket using + # boto3 directly. + storage_obj.delete() + + @pytest.fixture + def tmp_multiple_custom_source_storage_obj(self): + # Creates a list of storage objects with custom source names to + # create multiple scratch storages. + # Stores for each object in the list must be added in the test. + custom_source_names = ['"path With Spaces"', 'path With Spaces'] + storage_mult_obj = [] + for name in custom_source_names: + src_path = os.path.expanduser(f'~/{name}') + pathlib.Path(src_path).expanduser().mkdir(exist_ok=True) + timestamp = str(time.time()).replace('.', '') + store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}', + source=src_path) + storage_mult_obj.append(store_obj) + yield storage_mult_obj + for storage_obj in storage_mult_obj: + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + storage_obj.delete() + + @pytest.fixture + def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): + # Creates a temporary storage object. Stores must be added in the test. + yield from self.yield_storage_object(name=tmp_bucket_name, + source=tmp_source) + + @pytest.fixture + def tmp_local_list_storage_obj(self, tmp_bucket_name, tmp_source): + # Creates a temp storage object which uses a list of paths as source. + # Stores must be added in the test. 
After upload, the bucket should + # have two files - /tmp-file and /tmp-source/tmp-file + list_source = [tmp_source, tmp_source + '/tmp-file'] + yield from self.yield_storage_object(name=tmp_bucket_name, + source=list_source) + + @pytest.fixture + def tmp_bulk_del_storage_obj(self, tmp_bucket_name): + # Creates a temporary storage object for testing bulk deletion. + # Stores must be added in the test. + with tempfile.TemporaryDirectory() as tmpdir: + subprocess.check_output(f'mkdir -p {tmpdir}/folder{{000..255}}', + shell=True) + subprocess.check_output(f'touch {tmpdir}/test{{000..255}}.txt', + shell=True) + subprocess.check_output( + f'touch {tmpdir}/folder{{000..255}}/test.txt', shell=True) + yield from self.yield_storage_object(name=tmp_bucket_name, + source=tmpdir) + + @pytest.fixture + def tmp_copy_mnt_existing_storage_obj(self, tmp_scratch_storage_obj): + # Creates a copy mount storage which reuses an existing storage object. + tmp_scratch_storage_obj.add_store(storage_lib.StoreType.S3) + storage_name = tmp_scratch_storage_obj.name + + # Try to initialize another storage with the storage object created + # above, but now in COPY mode. This should succeed. + yield from self.yield_storage_object(name=storage_name, + mode=storage_lib.StorageMode.COPY) + + @pytest.fixture + def tmp_gitignore_storage_obj(self, tmp_bucket_name, gitignore_structure): + # Creates a temporary storage object for testing .gitignore filter. + # GITIGINORE_STRUCTURE is representing a file structure in a dictionary + # format. Created storage object will contain the file structure along + # with .gitignore and .git/info/exclude files to test exclude filter. + # Stores must be added in the test. 
+ with tempfile.TemporaryDirectory() as tmpdir: + # Creates file structure to be uploaded in the Storage + self.create_dir_structure(tmpdir, gitignore_structure) + + # Create .gitignore and list files/dirs to be excluded in it + skypilot_path = os.path.dirname(os.path.dirname(sky.__file__)) + temp_path = f'{tmpdir}/.gitignore' + file_path = os.path.join(skypilot_path, 'tests/gitignore_test') + shutil.copyfile(file_path, temp_path) + + # Create .git/info/exclude and list files/dirs to be excluded in it + temp_path = f'{tmpdir}/.git/info/' + os.makedirs(temp_path) + temp_exclude_path = os.path.join(temp_path, 'exclude') + file_path = os.path.join(skypilot_path, + 'tests/git_info_exclude_test') + shutil.copyfile(file_path, temp_exclude_path) + + # Create sky Storage with the files created + yield from self.yield_storage_object( + name=tmp_bucket_name, + source=tmpdir, + mode=storage_lib.StorageMode.COPY) + + @pytest.fixture + def tmp_awscli_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using awscli + bucket_uri = f's3://{tmp_bucket_name}' + subprocess.check_call(['aws', 's3', 'mb', bucket_uri]) + yield tmp_bucket_name, bucket_uri + subprocess.check_call(['aws', 's3', 'rb', bucket_uri, '--force']) + + @pytest.fixture + def tmp_gsutil_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using gsutil + bucket_uri = f'gs://{tmp_bucket_name}' + subprocess.check_call(['gsutil', 'mb', bucket_uri]) + yield tmp_bucket_name, bucket_uri + subprocess.check_call(['gsutil', 'rm', '-r', bucket_uri]) + + @pytest.fixture + def tmp_az_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using the Azure CLI + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + bucket_uri = data_utils.AZURE_CONTAINER_URL.format( + storage_account_name=storage_account_name, + container_name=tmp_bucket_name) + 
subprocess.check_call([ + 'az', 'storage', 'container', 'create', '--name', + f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', + '--account-key', f'{storage_account_key}' + ]) + yield tmp_bucket_name, bucket_uri + subprocess.check_call([ + 'az', 'storage', 'container', 'delete', '--name', + f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', + '--account-key', f'{storage_account_key}' + ]) + + @pytest.fixture + def tmp_awscli_bucket_r2(self, tmp_bucket_name): + # Creates a temporary bucket using awscli + endpoint_url = cloudflare.create_endpoint() + bucket_uri = f's3://{tmp_bucket_name}' + subprocess.check_call( + f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 mb {bucket_uri} --endpoint {endpoint_url} --profile=r2', + shell=True) + yield tmp_bucket_name, bucket_uri + subprocess.check_call( + f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {bucket_uri} --force --endpoint {endpoint_url} --profile=r2', + shell=True) + + @pytest.fixture + def tmp_ibm_cos_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using IBM COS API + storage_obj = storage_lib.IBMCosStore(source="", name=tmp_bucket_name) + yield tmp_bucket_name + storage_obj.delete() + + @pytest.fixture + def tmp_public_storage_obj(self, request): + # Initializes a storage object with a public bucket + storage_obj = storage_lib.Storage(source=request.param) + yield storage_obj + # This does not require any deletion logic because it is a public bucket + # and should not get added to global_user_state. 
+ + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, + store_type): + # Creates a new bucket with a local source, uploads files to it + # and deletes it. + tmp_local_storage_obj.add_store(store_type) + + # Run sky storage ls to check if storage object exists in the output + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_local_storage_obj.name in out.decode('utf-8') + + # Run sky storage delete to delete the storage object + subprocess.check_output( + ['sky', 'storage', 'delete', tmp_local_storage_obj.name, '--yes']) + + # Run sky storage ls to check if storage object is deleted + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_local_storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.xdist_group('multiple_bucket_deletion') + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm) + ]) + def test_multiple_buckets_creation_and_deletion( + self, tmp_multiple_scratch_storage_obj, store_type): + # Creates multiple new buckets(5 buckets) with a local source + # and deletes them. 
+ storage_obj_name = [] + for store_obj in tmp_multiple_scratch_storage_obj: + store_obj.add_store(store_type) + storage_obj_name.append(store_obj.name) + + # Run sky storage ls to check if all storage objects exists in the + # output filtered by store type + out_all = subprocess.check_output(['sky', 'storage', 'ls']) + out = [ + item.split()[0] + for item in out_all.decode('utf-8').splitlines() + if store_type.value in item + ] + assert all([item in out for item in storage_obj_name]) + + # Run sky storage delete all to delete all storage objects + delete_cmd = ['sky', 'storage', 'delete', '--yes'] + delete_cmd += storage_obj_name + subprocess.check_output(delete_cmd) + + # Run sky storage ls to check if all storage objects filtered by store + # type are deleted + out_all = subprocess.check_output(['sky', 'storage', 'ls']) + out = [ + item.split()[0] + for item in out_all.decode('utf-8').splitlines() + if store_type.value in item + ] + assert all([item not in out for item in storage_obj_name]) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_upload_source_with_spaces(self, store_type, + tmp_multiple_custom_source_storage_obj): + # Creates two buckets with specified local sources + # with spaces in the name + storage_obj_names = [] + for storage_obj in tmp_multiple_custom_source_storage_obj: + storage_obj.add_store(store_type) + storage_obj_names.append(storage_obj.name) + + # Run sky storage ls to check if all storage objects exists in the + # output filtered by store type + out_all = subprocess.check_output(['sky', 'storage', 'ls']) + out = [ + item.split()[0] + for item in out_all.decode('utf-8').splitlines() + if store_type.value in item + ] + assert all([item in 
out for item in storage_obj_names]) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_bucket_external_deletion(self, tmp_scratch_storage_obj, + store_type): + # Creates a bucket, deletes it externally using cloud cli commands + # and then tries to delete it using sky storage delete. + tmp_scratch_storage_obj.add_store(store_type) + + # Run sky storage ls to check if storage object exists in the output + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_scratch_storage_obj.name in out.decode('utf-8') + + # Delete bucket externally + cmd = self.cli_delete_cmd(store_type, tmp_scratch_storage_obj.name) + subprocess.check_output(cmd, shell=True) + + # Run sky storage delete to delete the storage object + out = subprocess.check_output( + ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name, '--yes']) + # Make sure bucket was not created during deletion (see issue #1322) + assert 'created' not in out.decode('utf-8').lower() + + # Run sky storage ls to check if storage object is deleted + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_scratch_storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_bucket_bulk_deletion(self, store_type, tmp_bulk_del_storage_obj): + # Creates a temp folder with over 256 files and folders, upload + # files and folders to a new bucket, then delete bucket. 
+ tmp_bulk_del_storage_obj.add_store(store_type) + + subprocess.check_output([ + 'sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name, '--yes' + ]) + + output = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_bulk_del_storage_obj.name not in output.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'tmp_public_storage_obj, store_type', + [('s3://tcga-2-open', storage_lib.StoreType.S3), + ('s3://digitalcorpora', storage_lib.StoreType.S3), + ('gs://gcp-public-data-sentinel-2', storage_lib.StoreType.GCS), + pytest.param( + 'https://azureopendatastorage.blob.core.windows.net/nyctlc', + storage_lib.StoreType.AZURE, + marks=pytest.mark.azure)], + indirect=['tmp_public_storage_obj']) + def test_public_bucket(self, tmp_public_storage_obj, store_type): + # Creates a new bucket with a public source and verifies that it is not + # added to global_user_state. + tmp_public_storage_obj.add_store(store_type) + + # Run sky storage ls to check if storage object exists in the output + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_public_storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'nonexist_bucket_url', + [ + 's3://{random_name}', + 'gs://{random_name}', + pytest.param( + 'https://{account_name}.blob.core.windows.net/{random_name}', # pylint: disable=line-too-long + marks=pytest.mark.azure), + pytest.param('cos://us-east/{random_name}', marks=pytest.mark.ibm), + pytest.param('r2://{random_name}', marks=pytest.mark.cloudflare) + ]) + def test_nonexistent_bucket(self, nonexist_bucket_url): + # Attempts to fetch a storage with a non-existent source. 
+ # Generate a random bucket name and verify it doesn't exist: + retry_count = 0 + while True: + nonexist_bucket_name = str(uuid.uuid4()) + if nonexist_bucket_url.startswith('s3'): + command = f'aws s3api head-bucket --bucket {nonexist_bucket_name}' + expected_output = '404' + elif nonexist_bucket_url.startswith('gs'): + command = f'gsutil ls {nonexist_bucket_url.format(random_name=nonexist_bucket_name)}' + expected_output = 'BucketNotFoundException' + elif nonexist_bucket_url.startswith('https'): + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + command = f'az storage container exists --account-name {storage_account_name} --account-key {storage_account_key} --name {nonexist_bucket_name}' + expected_output = '"exists": false' + elif nonexist_bucket_url.startswith('r2'): + endpoint_url = cloudflare.create_endpoint() + command = f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api head-bucket --bucket {nonexist_bucket_name} --endpoint {endpoint_url} --profile=r2' + expected_output = '404' + elif nonexist_bucket_url.startswith('cos'): + # Using API calls, since using rclone requires a profile's name + try: + expected_output = command = "echo" # avoid unrelated exception in case of failure. 
+ bucket_name = urllib.parse.urlsplit( + nonexist_bucket_url.format( + random_name=nonexist_bucket_name)).path.strip('/') + client = ibm.get_cos_client('us-east') + client.head_bucket(Bucket=bucket_name) + except ibm.ibm_botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == '404': + # success + return + else: + raise ValueError('Unsupported bucket type ' + f'{nonexist_bucket_url}') + + # Check if bucket exists using the cli: + try: + out = subprocess.check_output(command, + stderr=subprocess.STDOUT, + shell=True) + except subprocess.CalledProcessError as e: + out = e.output + out = out.decode('utf-8') + if expected_output in out: + break + else: + retry_count += 1 + if retry_count > 3: + raise RuntimeError('Unable to find a nonexistent bucket ' + 'to use. This is higly unlikely - ' + 'check if the tests are correct.') + + with pytest.raises(sky.exceptions.StorageBucketGetError, + match='Attempted to use a non-existent'): + if nonexist_bucket_url.startswith('https'): + storage_obj = storage_lib.Storage( + source=nonexist_bucket_url.format( + account_name=storage_account_name, + random_name=nonexist_bucket_name)) + else: + storage_obj = storage_lib.Storage( + source=nonexist_bucket_url.format( + random_name=nonexist_bucket_name)) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'private_bucket', + [ + f's3://imagenet', + f'gs://imagenet', + pytest.param('https://smoketestprivate.blob.core.windows.net/test', + marks=pytest.mark.azure), # pylint: disable=line-too-long + pytest.param('cos://us-east/bucket1', marks=pytest.mark.ibm) + ]) + def test_private_bucket(self, private_bucket): + # Attempts to access private buckets not belonging to the user. + # These buckets are known to be private, but may need to be updated if + # they are removed by their owners. 
+ store_type = urllib.parse.urlsplit(private_bucket).scheme + if store_type == 'https' or store_type == 'cos': + private_bucket_name = urllib.parse.urlsplit( + private_bucket).path.strip('/') + else: + private_bucket_name = urllib.parse.urlsplit(private_bucket).netloc + with pytest.raises( + sky.exceptions.StorageBucketGetError, + match=storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( + name=private_bucket_name)): + storage_obj = storage_lib.Storage(source=private_bucket) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('ext_bucket_fixture, store_type', + [('tmp_awscli_bucket', storage_lib.StoreType.S3), + ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), + pytest.param('tmp_az_bucket', + storage_lib.StoreType.AZURE, + marks=pytest.mark.azure), + pytest.param('tmp_ibm_cos_bucket', + storage_lib.StoreType.IBM, + marks=pytest.mark.ibm), + pytest.param('tmp_awscli_bucket_r2', + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_upload_to_existing_bucket(self, ext_bucket_fixture, request, + tmp_source, store_type): + # Tries uploading existing files to newly created bucket (outside of + # sky) and verifies that files are written. 
+ bucket_name, _ = request.getfixturevalue(ext_bucket_fixture) + storage_obj = storage_lib.Storage(name=bucket_name, source=tmp_source) + storage_obj.add_store(store_type) + + # Check if tmp_source/tmp-file exists in the bucket using aws cli + out = subprocess.check_output(self.cli_ls_cmd(store_type, bucket_name), + shell=True) + assert 'tmp-file' in out.decode('utf-8'), \ + 'File not found in bucket - output was : {}'.format(out.decode + ('utf-8')) + + # Check symlinks - symlinks don't get copied by sky storage + assert (pathlib.Path(tmp_source) / 'circle-link').is_symlink(), ( + 'circle-link was not found in the upload source - ' + 'are the test fixtures correct?') + assert 'circle-link' not in out.decode('utf-8'), ( + 'Symlink found in bucket - ls output was : {}'.format( + out.decode('utf-8'))) + + # Run sky storage ls to check if storage object exists in the output. + # It should not exist because the bucket was created externally. + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + def test_copy_mount_existing_storage(self, + tmp_copy_mnt_existing_storage_obj): + # Creates a bucket with no source in MOUNT mode (empty bucket), and + # then tries to load the same storage in COPY mode. + tmp_copy_mnt_existing_storage_obj.add_store(storage_lib.StoreType.S3) + storage_name = tmp_copy_mnt_existing_storage_obj.name + + # Check `sky storage ls` to ensure storage object exists + out = subprocess.check_output(['sky', 'storage', 'ls']).decode('utf-8') + assert storage_name in out, f'Storage {storage_name} not found in sky storage ls.' 
+ + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_list_source(self, tmp_local_list_storage_obj, store_type): + # Uses a list in the source field to specify a file and a directory to + # be uploaded to the storage object. + tmp_local_list_storage_obj.add_store(store_type) + + # Check if tmp-file exists in the bucket root using cli + out = subprocess.check_output(self.cli_ls_cmd( + store_type, tmp_local_list_storage_obj.name), + shell=True) + assert 'tmp-file' in out.decode('utf-8'), \ + 'File not found in bucket - output was : {}'.format(out.decode + ('utf-8')) + + # Check if tmp-file exists in the bucket/tmp-source using cli + out = subprocess.check_output(self.cli_ls_cmd( + store_type, tmp_local_list_storage_obj.name, 'tmp-source/'), + shell=True) + assert 'tmp-file' in out.decode('utf-8'), \ + 'File not found in bucket - output was : {}'.format(out.decode + ('utf-8')) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('invalid_name_list, store_type', + [(AWS_INVALID_NAMES, storage_lib.StoreType.S3), + (GCS_INVALID_NAMES, storage_lib.StoreType.GCS), + pytest.param(AZURE_INVALID_NAMES, + storage_lib.StoreType.AZURE, + marks=pytest.mark.azure), + pytest.param(IBM_INVALID_NAMES, + storage_lib.StoreType.IBM, + marks=pytest.mark.ibm), + pytest.param(AWS_INVALID_NAMES, + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_invalid_names(self, invalid_name_list, store_type): + # Attempts to create storage objects with invalid names and verifies + # that each raises a StorageNameError. 
+ for name in invalid_name_list: + with pytest.raises(sky.exceptions.StorageNameError): + storage_obj = storage_lib.Storage(name=name) + storage_obj.add_store(store_type) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'gitignore_structure, store_type', + [(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.S3), + (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.GCS), + (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.AZURE), + pytest.param(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, + store_type, + tmp_gitignore_storage_obj): + # tests if files included in .gitignore and .git/info/exclude are + # excluded from being transferred to Storage + + tmp_gitignore_storage_obj.add_store(store_type) + + upload_file_name = 'included' + # Count the number of files with the given file name + up_cmd = self.cli_count_name_in_bucket(store_type, \ + tmp_gitignore_storage_obj.name, file_name=upload_file_name) + git_exclude_cmd = self.cli_count_name_in_bucket(store_type, \ + tmp_gitignore_storage_obj.name, file_name='.git') + cnt_num_file_cmd = self.cli_count_file_in_bucket( + store_type, tmp_gitignore_storage_obj.name) + + up_output = subprocess.check_output(up_cmd, shell=True) + git_exclude_output = subprocess.check_output(git_exclude_cmd, + shell=True) + cnt_output = subprocess.check_output(cnt_num_file_cmd, shell=True) + + assert '3' in up_output.decode('utf-8'), \ + 'Files to be included are not completely uploaded.' + # 1 is read as .gitignore is uploaded + assert '1' in git_exclude_output.decode('utf-8'), \ + '.git directory should not be uploaded.' + # 4 files include .gitignore, included.log, included.txt, include_dir/included.log + assert '4' in cnt_output.decode('utf-8'), \ + 'Some items listed in .gitignore and .git/info/exclude are not excluded.' 
+ + @pytest.mark.parametrize('ext_bucket_fixture, store_type', + [('tmp_awscli_bucket', storage_lib.StoreType.S3), + ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), + pytest.param('tmp_awscli_bucket_r2', + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_externally_created_bucket_mount_without_source( + self, ext_bucket_fixture, request, store_type): + # Non-sky managed buckets(buckets created outside of Skypilot CLI) + # are allowed to be MOUNTed by specifying the URI of the bucket to + # source field only. When it is attempted by specifying the name of + # the bucket only, it should error out. + # + # TODO(doyoung): Add test for IBM COS. Currently, this is blocked + # as rclone used to interact with IBM COS does not support feature to + # create a bucket, and the ibmcloud CLI is not supported in Skypilot. + # Either of the feature is necessary to simulate an external bucket + # creation for IBM COS. + # https://github.com/skypilot-org/skypilot/pull/1966/files#r1253439837 + + ext_bucket_name, ext_bucket_uri = request.getfixturevalue( + ext_bucket_fixture) + # invalid spec + with pytest.raises(sky.exceptions.StorageSpecError) as e: + storage_obj = storage_lib.Storage( + name=ext_bucket_name, mode=storage_lib.StorageMode.MOUNT) + storage_obj.add_store(store_type) + + assert 'Attempted to mount a non-sky managed bucket' in str(e) + + # valid spec + storage_obj = storage_lib.Storage(source=ext_bucket_uri, + mode=storage_lib.StorageMode.MOUNT) + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + storage_obj.delete() + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('region', [ + 'ap-northeast-1', 'ap-northeast-2', 'ap-northeast-3', 'ap-south-1', + 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'eu-north-1', + 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 'us-east-1', + 'us-east-2', 'us-west-1', 'us-west-2' + ]) + def test_aws_regions(self, tmp_local_storage_obj, region): + # This tests 
creation and upload to bucket in all AWS s3 regions + # To test full functionality, use test_managed_jobs_storage above. + store_type = storage_lib.StoreType.S3 + tmp_local_storage_obj.add_store(store_type, region=region) + bucket_name = tmp_local_storage_obj.name + + # Confirm that the bucket was created in the correct region + region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) + out = subprocess.check_output(region_cmd, shell=True) + output = out.decode('utf-8') + expected_output_region = region + if region == 'us-east-1': + expected_output_region = 'None' # us-east-1 is the default region + assert expected_output_region in out.decode('utf-8'), ( + f'Bucket was not found in region {region} - ' + f'output of {region_cmd} was: {output}') + + # Check if tmp_source/tmp-file exists in the bucket using cli + ls_cmd = self.cli_ls_cmd(store_type, bucket_name) + out = subprocess.check_output(ls_cmd, shell=True) + output = out.decode('utf-8') + assert 'tmp-file' in output, ( + f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('region', [ + 'northamerica-northeast1', 'northamerica-northeast2', 'us-central1', + 'us-east1', 'us-east4', 'us-east5', 'us-south1', 'us-west1', 'us-west2', + 'us-west3', 'us-west4', 'southamerica-east1', 'southamerica-west1', + 'europe-central2', 'europe-north1', 'europe-southwest1', 'europe-west1', + 'europe-west2', 'europe-west3', 'europe-west4', 'europe-west6', + 'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12', + 'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2', + 'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2', + 'asia-southeast2', 'me-central1', 'me-central2', 'me-west1', + 'australia-southeast1', 'australia-southeast2', 'africa-south1' + ]) + def test_gcs_regions(self, tmp_local_storage_obj, region): + # This tests creation and upload to bucket in all GCS regions + # To test full functionality, use 
 test_managed_jobs_storage above. + store_type = storage_lib.StoreType.GCS + tmp_local_storage_obj.add_store(store_type, region=region) + bucket_name = tmp_local_storage_obj.name + + # Confirm that the bucket was created in the correct region + region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) + out = subprocess.check_output(region_cmd, shell=True) + output = out.decode('utf-8') + assert region in out.decode('utf-8'), ( + f'Bucket was not found in region {region} - ' + f'output of {region_cmd} was: {output}') + + # Check if tmp_source/tmp-file exists in the bucket using cli + ls_cmd = self.cli_ls_cmd(store_type, bucket_name) + out = subprocess.check_output(ls_cmd, shell=True) + output = out.decode('utf-8') + assert 'tmp-file' in output, ( + f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index 0fc7ce409fc..3000c82068d 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -1,56 +1,34 @@ -# Smoke tests for SkyPilot +# Smoke tests for SkyPilot for region and zone # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py +# > pytest tests/smoke_tests/test_region_and_zone.py # # Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure +# > pytest tests/smoke_tests/test_region_and_zone.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve +# > pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region # # Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws +# > pytest 
tests/smoke_tests/test_region_and_zone.py --aws # # Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws - -import enum -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys +# > pytest tests/smoke_tests/test_region_and_zone.py --generic-cloud aws + import tempfile import textwrap -import time -from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse -import uuid -import colorama -import jinja2 import pytest -from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains_wildcard from smoke_tests.util import run_one_test from smoke_tests.util import Test +from smoke_tests.util import ( + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) from sky.jobs.state import ManagedJobStatus from sky.skylet import constants @@ -60,7 +38,7 @@ # ---------- Test region ---------- @pytest.mark.aws def test_aws_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_region', [ @@ -81,7 +59,7 @@ def test_aws_region(): @pytest.mark.aws def test_aws_with_ssh_proxy_command(): - name = _get_cluster_name() + name = get_cluster_name() with tempfile.NamedTemporaryFile(mode='w') as f: f.write( @@ -104,13 +82,12 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- _get_cmd_wait_until_cluster_status_contains_wildcard( + get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. - format( + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, job_status= f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', @@ -123,7 +100,7 @@ def test_aws_with_ssh_proxy_command(): @pytest.mark.gcp def test_gcp_region_and_service_account(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_region', [ @@ -146,7 +123,7 @@ def test_gcp_region_and_service_account(): @pytest.mark.ibm def test_ibm_region(): - name = _get_cluster_name() + name = get_cluster_name() region = 'eu-de' test = Test( 'region', @@ -163,7 +140,7 @@ def test_ibm_region(): @pytest.mark.azure def test_azure_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'azure_region', [ @@ -187,7 +164,7 @@ def test_azure_region(): # ---------- Test zone ---------- @pytest.mark.aws def test_aws_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_zone', [ @@ -203,7 +180,7 @@ def test_aws_zone(): @pytest.mark.ibm def test_ibm_zone(): - name = _get_cluster_name() + name = get_cluster_name() zone = 'eu-de-2' test = Test( 'zone', @@ -220,7 +197,7 @@ def test_ibm_zone(): @pytest.mark.gcp def test_gcp_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_zone', [ diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_required_before_merge.py new file mode 100644 index 00000000000..dd368718821 --- /dev/null +++ b/tests/smoke_tests/test_required_before_merge.py @@ -0,0 +1,46 @@ +# Smoke tests for SkyPilot 
required before merging +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_required_before_merge.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_required_before_merge.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_required_before_merge.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws + +from smoke_tests.util import get_cluster_name +from smoke_tests.util import run_one_test +from smoke_tests.util import Test +from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID + +from sky.skylet import events +from sky.skylet.job_lib import JobStatus + + +def test_yaml_launch_and_mount(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'test_yaml_launch_and_mount', + [ + f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', + WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=name, + job_id=1, + job_status=JobStatus.SUCCEEDED.value, + timeout=2 * 60), + ], + f'sky down -y {name}', + timeout=5 * 60, + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py new file mode 100644 index 00000000000..f56d9bb96ee --- /dev/null +++ b/tests/smoke_tests/test_sky_serve.py @@ -0,0 +1,795 @@ +# Smoke tests for SkyPilot for sky serve +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_sky_serve.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_sky_serve.py --terminate-on-failure +# 
+
+# Re-run last failed tests
+# > pytest --lf
+#
+# Run one of the smoke tests
+# > pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http
+#
+# Only run sky serve tests
+# > pytest tests/smoke_tests/test_sky_serve.py --sky-serve
+#
+# Only run test for AWS + generic tests
+# > pytest tests/smoke_tests/test_sky_serve.py --aws
+#
+# Change cloud for generic tests to aws
+# > pytest tests/smoke_tests/test_sky_serve.py --generic-cloud aws
+
+import inspect
+import json
+import shlex
+from typing import List, Tuple
+
+import pytest
+from smoke_tests.util import get_cluster_name
+from smoke_tests.util import run_one_test
+from smoke_tests.util import terminate_gcp_replica
+from smoke_tests.util import Test
+from smoke_tests.util import test_id
+
+from sky import serve
+from sky.utils import common_utils
+
+# ---------- Testing skyserve ----------
+
+
+def _get_service_name() -> str:
+    """Returns a user-unique service name for each test_skyserve_().
+
+    Must be called from each test_skyserve_().
+    """
+    caller_func_name = inspect.stack()[1][3]
+    test_name = caller_func_name.replace('_', '-').replace('test-', 't-')
+    test_name = test_name.replace('skyserve-', 'ss-')
+    test_name = common_utils.make_cluster_name_on_cloud(test_name, 24)
+    return f'{test_name}-{test_id}'
+
+
+# We check the output of the skyserve service to see if it is ready. Output of
+# `REPLICAS` is in the form of `1/2` where the first number is the number of
+# ready replicas and the second number is the number of total replicas. We
+# grep such format to ensure that the service is ready, and early exit if any
+# failure detected. In the end we sleep for
+# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure the load balancer has
+# enough time to sync with the controller and get all ready replica IPs. 
+_SERVE_WAIT_UNTIL_READY = ( + '{{ while true; do' + ' s=$(sky serve status {name}); echo "$s";' + ' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;' + ' echo "$s" | grep -q "FAILED" && exit 1;' + ' sleep 10;' + ' done; }}; echo "Got service status $s";' + f'sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS + 2};') +_IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' +_AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' +_SERVICE_LAUNCHING_STATUS_REGEX = 'PROVISIONING\|STARTING' +# Since we don't allow terminate the service if the controller is INIT, +# which is common for simultaneous pytest, we need to wait until the +# controller is UP before we can terminate the service. +# The teardown command has a 10-mins timeout, so we don't need to do +# the timeout here. See implementation of run_one_test() for details. +_TEARDOWN_SERVICE = ( + '(for i in `seq 1 20`; do' + ' s=$(sky serve down -y {name});' + ' echo "Trying to terminate {name}";' + ' echo "$s";' + ' echo "$s" | grep -q "scheduled to be terminated\|No service to terminate" && break;' + ' sleep 10;' + ' [ $i -eq 20 ] && echo "Failed to terminate service {name}";' + 'done)') + +_SERVE_ENDPOINT_WAIT = ( + 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' + 'endpoint=$(sky serve status --endpoint {name}); ' + 'until ! echo "$endpoint" | grep "Controller is initializing"; ' + 'do echo "Waiting for serve endpoint to be ready..."; ' + 'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; ' + 'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"') + +_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); ' + 'until ! 
echo "$s" | grep "Controller is initializing."; ' + 'do echo "Waiting for serve status to be ready..."; ' + 'sleep 5; s=$(sky serve status {name}); done; echo "$s"') + + +def _get_replica_ip(name: str, replica_id: int) -> str: + return (f'ip{replica_id}=$(echo "$s" | ' + f'awk "{_AWK_ALL_LINES_BELOW_REPLICAS}" | ' + f'grep -E "{name}\s+{replica_id}" | ' + f'grep -Eo "{_IP_REGEX}")') + + +def _get_skyserve_http_test(name: str, cloud: str, + timeout_minutes: int) -> Test: + test = Test( + f'test-skyserve-{cloud.replace("_", "-")}', + [ + f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=timeout_minutes * 60, + ) + return test + + +def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool, + str]]) -> str: + """Check replicas' status and count in sky serve status + + We will check vCPU=2, as all our tests use vCPU=2. + + Args: + name: the name of the service + check_tuples: A list of replica property to check. Each tuple is + (count, is_spot, status) + """ + check_cmd = '' + for check_tuple in check_tuples: + count, is_spot, status = check_tuple + resource_str = '' + if status not in ['PENDING', 'SHUTTING_DOWN' + ] and not status.startswith('FAILED'): + spot_str = '' + if is_spot: + spot_str = '\[Spot\]' + resource_str = f'({spot_str}vCPU=2)' + check_cmd += (f' echo "$s" | grep "{resource_str}" | ' + f'grep "{status}" | wc -l | grep {count} || exit 1;') + return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd) + + +def _check_service_version(service_name: str, version: str) -> str: + # Grep the lines before 'Service Replicas' and check if the service version + # is correct. 
+ return (f'echo "$s" | grep -B1000 "Service Replicas" | ' + f'grep -E "{service_name}\s+{version}" || exit 1; ') + + +@pytest.mark.gcp +@pytest.mark.serve +def test_skyserve_gcp_http(): + """Test skyserve on GCP""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'gcp', 20) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.serve +def test_skyserve_aws_http(): + """Test skyserve on AWS""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'aws', 20) + run_one_test(test) + + +@pytest.mark.azure +@pytest.mark.serve +def test_skyserve_azure_http(): + """Test skyserve on Azure""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'azure', 30) + run_one_test(test) + + +@pytest.mark.kubernetes +@pytest.mark.serve +def test_skyserve_kubernetes_http(): + """Test skyserve on Kubernetes""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'kubernetes', 30) + run_one_test(test) + + +@pytest.mark.oci +@pytest.mark.serve +def test_skyserve_oci_http(): + """Test skyserve on OCI""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'oci', 20) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now +@pytest.mark.serve +def test_skyserve_llm(generic_cloud: str): + """Test skyserve with real LLM usecase""" + name = _get_service_name() + + def generate_llm_test_command(prompt: str, expected_output: str) -> str: + prompt = shlex.quote(prompt) + expected_output = shlex.quote(expected_output) + return ( + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'python tests/skyserve/llm/get_response.py --endpoint $endpoint ' + f'--prompt {prompt} | grep {expected_output}') + + with open('tests/skyserve/llm/prompt_output.json', 'r', + encoding='utf-8') as f: + prompt2output = json.load(f) + + test = Test( + f'test-skyserve-llm', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', + 
_SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + *[ + generate_llm_test_command(prompt, output) + for prompt, output in prompt2output.items() + ], + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=40 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.serve +def test_skyserve_spot_recovery(): + name = _get_service_name() + zone = 'us-central1-a' + + test = Test( + f'test-skyserve-spot-recovery-gcp', + [ + f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + terminate_gcp_replica(name, zone, 1), + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support spot instances +@pytest.mark.serve +@pytest.mark.no_kubernetes +def test_skyserve_base_ondemand_fallback(generic_cloud: str): + name = _get_service_name() + test = Test( + f'test-skyserve-base-ondemand-fallback', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + _check_replica_in_status(name, [(1, True, 'READY'), + (1, False, 'READY')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.serve +def test_skyserve_dynamic_ondemand_fallback(): + name = _get_service_name() + zone = 'us-central1-a' + + test = Test( + f'test-skyserve-dynamic-ondemand-fallback', + [ + f'sky serve up -n {name} --cloud gcp -y 
tests/skyserve/spot/dynamic_ondemand_fallback.yaml', + f'sleep 40', + # 2 on-demand (provisioning) + 2 Spot (provisioning). + f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s";' + 'echo "$s" | grep -q "0/4" || exit 1', + # Wait for the provisioning starts + f'sleep 40', + _check_replica_in_status(name, [ + (2, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), + (2, False, _SERVICE_LAUNCHING_STATUS_REGEX + '\|SHUTTING_DOWN') + ]), + + # Wait until 2 spot instances are ready. + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + _check_replica_in_status(name, [(2, True, 'READY'), + (0, False, '')]), + terminate_gcp_replica(name, zone, 1), + f'sleep 40', + # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). + f'{_SERVE_STATUS_WAIT.format(name=name)}; ' + 'echo "$s" | grep -q "1/3"', + _check_replica_in_status( + name, [(1, True, 'READY'), + (1, True, _SERVICE_LAUNCHING_STATUS_REGEX), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), + + # Wait until 2 spot instances are ready. + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + _check_replica_in_status(name, [(2, True, 'READY'), + (0, False, '')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_user_bug_restart(generic_cloud: str): + """Tests that we restart the service after user bug.""" + # TODO(zhwu): this behavior needs some rethinking. 
+ name = _get_service_name() + test = Test( + f'test-skyserve-user-bug-restart', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', + f's=$(sky serve status {name}); echo "$s";' + 'until echo "$s" | grep -A 100 "Service Replicas" | grep "SHUTTING_DOWN"; ' + 'do echo "Waiting for first service to be SHUTTING DOWN..."; ' + f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; ', + f's=$(sky serve status {name}); echo "$s";' + 'until echo "$s" | grep -A 100 "Service Replicas" | grep "FAILED"; ' + 'do echo "Waiting for first service to be FAILED..."; ' + f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; echo "$s"; ' + + _check_replica_in_status(name, [(1, True, 'FAILED')]) + + # User bug failure will cause no further scaling. + f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 1; ' + f'echo "$s" | grep -B 100 "NO_REPLICA" | grep "0/0"', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/auto_restart.yaml', + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'until curl http://$endpoint | grep "Hi, SkyPilot here!"; do sleep 2; done; sleep 2; ' + + _check_replica_in_status(name, [(1, False, 'READY'), + (1, False, 'FAILED')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +@pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP +def test_skyserve_load_balancer(generic_cloud: str): + """Test skyserve load balancer round-robin policy""" + name = _get_service_name() + test = Test( + f'test-skyserve-load-balancer', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{_SERVE_STATUS_WAIT.format(name=name)}; ' + f'{_get_replica_ip(name, 1)}; ' + f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' + 
'python tests/skyserve/load_balancer/test_round_robin.py ' + '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.serve +@pytest.mark.no_kubernetes +def test_skyserve_auto_restart(): + """Test skyserve with auto restart""" + name = _get_service_name() + zone = 'us-central1-a' + test = Test( + f'test-skyserve-auto-restart', + [ + # TODO(tian): we can dynamically generate YAML from template to + # avoid maintaining too many YAML files + f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + # sleep for 20 seconds (initial delay) to make sure it will + # be restarted + f'sleep 20', + terminate_gcp_replica(name, zone, 1), + # Wait for consecutive failure timeout passed. + # If the cluster is not using spot, it won't check the cluster status + # on the cloud (since manual shutdown is not a common behavior and such + # queries takes a lot of time). Instead, we think continuous 3 min probe + # failure is not a temporary problem but indeed a failure. + 'sleep 180', + # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time + # that the output of `sky serve status` shows FAILED and this status will + # cause _SERVE_WAIT_UNTIL_READY to early quit. 
+ '(while true; do' + f' output=$(sky serve status {name});' + ' echo "$output" | grep -q "1/1" && break;' + ' sleep 10;' + f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_cancel(generic_cloud: str): + """Test skyserve with cancel""" + name = _get_service_name() + + test = Test( + f'test-skyserve-cancel', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' + 'tests/skyserve/cancel/send_cancel_request.py ' + '--endpoint $endpoint | grep "Request was cancelled"', + f's=$(sky serve logs {name} 1 --no-follow); ' + 'until ! echo "$s" | grep "Please wait for the controller to be"; ' + 'do echo "Waiting for serve logs"; sleep 10; ' + f's=$(sky serve logs {name} 1 --no-follow); done; ' + 'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_streaming(generic_cloud: str): + """Test skyserve with streaming""" + name = _get_service_name() + test = Test( + f'test-skyserve-streaming', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'python3 tests/skyserve/streaming/send_streaming_request.py ' + '--endpoint $endpoint | grep "Streaming test passed"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def 
test_skyserve_readiness_timeout_fail(generic_cloud: str): + """Test skyserve with large readiness probe latency, expected to fail""" + name = _get_service_name() + test = Test( + f'test-skyserve-readiness-timeout-fail', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', + # None of the readiness probe will pass, so the service will be + # terminated after the initial delay. + f's=$(sky serve status {name}); ' + f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done;', + 'sleep 60', + f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_large_readiness_timeout(generic_cloud: str): + """Test skyserve with customized large readiness timeout""" + name = _get_service_name() + test = Test( + f'test-skyserve-large-readiness-timeout', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_update(generic_cloud: str): + """Test skyserve with update""" + name = _get_service_name() + test = Test( + f'test-skyserve-update', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + 
f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/new.yaml', + # sleep before update is registered. + 'sleep 20', + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;' + # Make sure the traffic is not mixed + 'curl http://$endpoint | grep "Hi, new SkyPilot here"', + # The latest 2 version should be READY and the older versions should be shutting down + (_check_replica_in_status(name, [(2, False, 'READY'), + (2, False, 'SHUTTING_DOWN')]) + + _check_service_version(name, "2")), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_rolling_update(generic_cloud: str): + """Test skyserve with rolling update""" + name = _get_service_name() + single_new_replica = _check_replica_in_status( + name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (1, False, 'SHUTTING_DOWN')]) + test = Test( + f'test-skyserve-rolling-update', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/new.yaml', + # Make sure the traffic is mixed across two versions, the replicas + # with even id will sleep 60 seconds before being ready, so we + # should be able to get observe the period that the traffic is mixed + # across two versions. 
+ f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; ' + # The latest version should have one READY and the one of the older versions should be shutting down + f'{single_new_replica} {_check_service_version(name, "1,2")} ' + # Check the output from the old version, immediately after the + # output from the new version appears. This is guaranteed by the + # round robin load balancing policy. + # TODO(zhwu): we should have a more generalized way for checking the + # mixed version of replicas to avoid depending on the specific + # round robin load balancing policy. + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_fast_update(generic_cloud: str): + """Test skyserve with fast update (Increment version of old replicas)""" + name = _get_service_name() + + test = Test( + f'test-skyserve-fast-update', + [ + f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', + # sleep to wait for update to be registered. + 'sleep 40', + # 2 on-deamnd (ready) + 1 on-demand (provisioning). + ( + _check_replica_in_status( + name, [(2, False, 'READY'), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) + + # Fast update will directly have the latest version ready. 
+ _check_service_version(name, "2")), + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + + _check_service_version(name, "2"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + # Test rolling update + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', + # sleep to wait for update to be registered. + 'sleep 25', + # 2 on-deamnd (ready) + 1 on-demand (shutting down). + _check_replica_in_status(name, [(2, False, 'READY'), + (1, False, 'SHUTTING_DOWN')]), + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "3"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_update_autoscale(generic_cloud: str): + """Test skyserve update with autoscale""" + name = _get_service_name() + test = Test( + f'test-skyserve-update-autoscale', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "1"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/num_min_one.yaml', + # sleep before update is registered. + 'sleep 20', + # Timeout will be triggered when update fails. + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + + _check_service_version(name, "2"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here!"', + # Rolling Update + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', + # sleep before update is registered. + 'sleep 20', + # Timeout will be triggered when update fails. 
+ _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "3"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here!"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Spot instances are note supported by Fluidstack +@pytest.mark.serve +@pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes +@pytest.mark.parametrize('mode', ['rolling', 'blue_green']) +def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): + """Test skyserve with update that changes autoscaler""" + name = f'{_get_service_name()}-{mode}' + + wait_until_no_pending = ( + f's=$(sky serve status {name}); echo "$s"; ' + 'until ! echo "$s" | grep PENDING; do ' + ' echo "Waiting for replica to be out of pending..."; ' + f' sleep 5; s=$(sky serve status {name}); ' + ' echo "$s"; ' + 'done') + four_spot_up_cmd = _check_replica_in_status(name, [(4, True, 'READY')]) + update_check = [f'until ({four_spot_up_cmd}); do sleep 5; done; sleep 15;'] + if mode == 'rolling': + # Check rolling update, it will terminate one of the old on-demand + # instances, once there are 4 spot instance ready. + update_check += [ + _check_replica_in_status( + name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (1, False, 'SHUTTING_DOWN'), (1, False, 'READY')]) + + _check_service_version(name, "1,2"), + ] + else: + # Check blue green update, it will keep both old on-demand instances + # running, once there are 4 spot instance ready. 
+ update_check += [ + _check_replica_in_status( + name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (2, False, 'READY')]) + + _check_service_version(name, "1"), + ] + test = Test( + f'test-skyserve-new-autoscaler-update-{mode}', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "1"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 's=$(curl http://$endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml', + # Wait for update to be registered + f'sleep 90', + wait_until_no_pending, + _check_replica_in_status( + name, [(4, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (2, False, 'READY')]), + *update_check, + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + _check_replica_in_status(name, [(4, True, 'READY'), + (1, False, 'READY')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_failures(generic_cloud: str): + """Test replica failure statuses""" + name = _get_service_name() + + test = Test( + 'test-skyserve-failures', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', + f's=$(sky serve status {name}); ' + f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done;', + 'sleep 60', + f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep 
"FAILED_INITIAL_DELAY" | wc -l | grep 2; ' + # Make sure no new replicas are started for early failure. + f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 2;', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/failures/probing.yaml', + f's=$(sky serve status {name}); ' + # Wait for replica to be ready. + f'until echo "$s" | grep "READY"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done;', + # Wait for replica to change to FAILED_PROBING + f's=$(sky serve status {name}); ' + f'until echo "$s" | grep "FAILED_PROBING"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done', + # Wait for the PENDING replica to appear. + 'sleep 10', + # Wait until the replica is out of PENDING. + f's=$(sky serve status {name}); ' + f'until ! echo "$s" | grep "PENDING" && ! echo "$s" | grep "Please wait for the controller to be ready."; do ' + 'echo "Waiting for replica to be out of pending..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done; ' + + _check_replica_in_status( + name, [(1, False, 'FAILED_PROBING'), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), + # TODO(zhwu): add test for FAILED_PROVISION + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO(Ziming, Tian): Add tests for autoscaling. 
+ + +# ------- Testing user dependencies -------- +def test_user_dependencies(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'user-dependencies', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', + f'sky logs {name} 1 --status', + f'sky exec {name} "echo hi"', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + f'sky exec {name} "echo bye"', + f'sky logs {name} 3 --status', + f'sky launch -c {name} tests/test_yamls/different_default_conda_env.yaml', + f'sky logs {name} 4 --status', + # Launch again to test the default env does not affect SkyPilot + # runtime setup + f'sky launch -c {name} "python --version 2>&1 | grep \'Python 3.6\' || exit 1"', + f'sky logs {name} 5 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_smoke.py b/tests/smoke_tests/test_smoke.py deleted file mode 100644 index 348c880d7a7..00000000000 --- a/tests/smoke_tests/test_smoke.py +++ /dev/null @@ -1,5077 +0,0 @@ -# Smoke tests for SkyPilot -# Default options are set in pyproject.toml -# Example usage: -# Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py -# -# Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure -# -# Re-run last failed tests -# > pytest --lf -# -# Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve -# -# Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws -# -# Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws - -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys -import tempfile -import textwrap -import time -from typing import Dict, List, Optional, Tuple -import 
urllib.parse -import uuid - -import jinja2 -import pytest -from smoke_tests.util import _BUMP_UP_SECONDS -from smoke_tests.util import _get_cluster_name -from smoke_tests.util import _GET_JOB_QUEUE -from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _terminate_gcp_replica -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import get_aws_region_for_quota_failover -from smoke_tests.util import get_gcp_region_for_quota_failover -from smoke_tests.util import LAMBDA_TYPE -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS -from smoke_tests.util import Test -from smoke_tests.util import test_id - -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus -from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils - - -# ------------ Test stale job ------------ -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation 
-@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_stale_job(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'stale_job', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=100), - f'sky start {name} -y', - f'sky logs {name} 1 --status', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED_DRIVER', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_stale_job_manual_restart(): - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = Test( - 'aws_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text`; ' - f'aws ec2 stop-instances --region {region} ' - '--instance-ids $id', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=40), - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. 
- _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( - cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_stale_job_manual_restart(): - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - zone = 'us-west2-a' - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name={name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - stop_cmd = (f'gcloud compute instances stop --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'gcp_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - stop_cmd, - 'sleep 40', - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( - cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Check Sky's environment variables; workdir. ---------- -@pytest.mark.no_fluidstack # Requires amazon S3 -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_env_check(generic_cloud: str): - name = _get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( - 'env_check', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -# ---------- file_mounts ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. -def test_file_mounts(generic_cloud: str): - name = _get_cluster_name() - extra_flags = '' - if generic_cloud in 'kubernetes': - # Kubernetes does not support multi-node - # NOTE: This test will fail if you have a Kubernetes cluster running on - # arm64 (e.g., Apple Silicon) since goofys does not work on arm64. - extra_flags = '--num-nodes 1' - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ] - test = Test( - 'using_file_mounts', - test_commands, - f'sky down -y {name}', - _get_timeout(generic_cloud, 20 * 60), # 20 mins - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_file_mounts(): - name = _get_cluster_name() - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} {SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ] - test = Test( - 'SCP_using_file_mounts', - test_commands, - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires GCP to be enabled -def test_using_file_mounts_with_env_vars(generic_cloud: str): - name = _get_cluster_name() - storage_name = TestStorageWithCredentials.generate_bucket_name() - test_commands = [ - *STORAGE_SETUP_COMMANDS, - (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name}'), - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- # Override with --env: - (f'sky launch -y -c {name}-2 --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name} ' - '--env MY_LOCAL_PATH=tmpfile'), - f'sky logs {name}-2 1 --status', # Ensure the job succeeded. - ] - test = Test( - 'using_file_mounts_with_env_vars', - test_commands, - (f'sky down -y {name} {name}-2', - f'sky storage delete -y {storage_name} {storage_name}-2'), - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -# ---------- storage ---------- -@pytest.mark.aws -def test_aws_storage_mounts_with_stop(): - name = _get_cluster_name() - cloud = 'aws' - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'aws s3 ls {storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'aws_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_storage_mounts_with_stop(): - name = _get_cluster_name() - cloud = 'gcp' - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'gsutil ls gs://{storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'gcp_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_storage_mounts_with_stop(): - name = _get_cluster_name() - cloud = 'azure' - storage_name = f'sky-test-{int(time.time())}' - default_region = 'eastus' - storage_account_name = (storage_lib.AzureBlobStore. 
- get_default_storage_account_name(default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'output=$(az storage blob list -c {storage_name} --account-name {storage_account_name} --account-key {storage_account_key} --prefix hello.txt)' - # if the file does not exist, az storage blob list returns '[]' - f'[ "$output" = "[]" ] && exit 1;' - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'azure_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_storage_mounts(): - # Tests bucket mounting on k8s, assuming S3 is configured. - # This test will fail if run on non x86_64 architecture, since goofys is - # built for x86_64 only. 
- name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud kubernetes {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'aws s3 ls {storage_name}/hello.txt || ' - f'gsutil ls gs://{storage_name}/hello.txt', - ] - test = Test( - 'kubernetes_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_switch(): - name = _get_cluster_name() - new_context = f'sky-test-context-{int(time.time())}' - new_namespace = f'sky-test-namespace-{int(time.time())}' - - test_commands = [ - # Launch a cluster and run a simple task - f'sky launch -y -c {name} --cloud kubernetes "echo Hello from original context"', - f'sky logs {name} 1 --status', # Ensure job succeeded - - # Get current context details and save to a file for later use in cleanup - 'CURRENT_CONTEXT=$(kubectl config current-context); ' - 'echo "$CURRENT_CONTEXT" > /tmp/sky_test_current_context; ' - 'CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.cluster}"); ' - 'CURRENT_USER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.user}"); ' - - # Create a new context with a different name and namespace - f'kubectl config set-context {new_context} --cluster="$CURRENT_CLUSTER" --user="$CURRENT_USER" --namespace={new_namespace}', - - # Create the new namespace if it doesn't exist - f'kubectl create namespace {new_namespace} --dry-run=client -o yaml | kubectl 
apply -f -', - - # Set the new context as active - f'kubectl config use-context {new_context}', - - # Verify the new context is active - f'[ "$(kubectl config current-context)" = "{new_context}" ] || exit 1', - - # Try to run sky exec on the original cluster (should still work) - f'sky exec {name} "echo Success: sky exec works after context switch"', - - # Test sky queue - f'sky queue {name}', - - # Test SSH access - f'ssh {name} whoami', - ] - - cleanup_commands = ( - f'kubectl delete namespace {new_namespace}; ' - f'kubectl config delete-context {new_context}; ' - 'kubectl config use-context $(cat /tmp/sky_test_current_context); ' - 'rm /tmp/sky_test_current_context; ' - f'sky down -y {name}') - - test = Test( - 'kubernetes_context_switch', - test_commands, - cleanup_commands, - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test image with python 3.11 installed by default. - 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - ]) -def test_docker_storage_mounts(generic_cloud: str, image_id: str): - # Tests bucket mounting on docker container - name = _get_cluster_name() - timestamp = str(time.time()).replace('.', '') - storage_name = f'sky-test-{timestamp}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - # ubuntu 18.04 does not support fuse3, and blobfuse2 depends on fuse3. - azure_mount_unsupported_ubuntu_version = '18.04' - # Commands to verify bucket upload. We need to check all three - # storage types because the optimizer may pick any of them. 
- s3_command = f'aws s3 ls {storage_name}/hello.txt' - gsutil_command = f'gsutil ls gs://{storage_name}/hello.txt' - azure_blob_command = TestStorageWithCredentials.cli_ls_cmd( - storage_lib.StoreType.AZURE, storage_name, suffix='hello.txt') - if azure_mount_unsupported_ubuntu_version in image_id: - # The store for mount_private_mount is not specified in the template. - # If we're running on Azure, the private mount will be created on - # azure blob. That will not be supported on the ubuntu 18.04 image - # and thus fail. For other clouds, the private mount on other - # storage types (GCS/S3) should succeed. - include_private_mount = False if generic_cloud == 'azure' else True - content = template.render(storage_name=storage_name, - include_azure_mount=False, - include_private_mount=include_private_mount) - else: - content = template.render(storage_name=storage_name,) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - # Check AWS, GCP, or Azure storage mount. 
- f'{s3_command} || ' - f'{gsutil_command} || ' - f'{azure_blob_command}', - ] - test = Test( - 'docker_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.cloudflare -def test_cloudflare_storage_mounts(generic_cloud: str): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_r2_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - endpoint_url = cloudflare.create_endpoint() - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2' - ] - - test = Test( - 'cloudflare_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_storage_mounts(): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - storage_name, Rclone.RcloneClouds.IBM) - template_str = pathlib.Path( - 'tests/test_yamls/test_ibm_cos_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud ibm {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt', - ] - test = Test( - 'ibm_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -# ---------- CLI logs ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. -def test_cli_logs(generic_cloud: str): - name = _get_cluster_name() - num_nodes = 2 - if generic_cloud == 'kubernetes': - # Kubernetes does not support multi-node - num_nodes = 1 - timestamp = time.time() - test = Test('cli_logs', [ - f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], f'sky down -y {name}') - run_one_test(test) - - -@pytest.mark.scp -def test_scp_logs(): - name = _get_cluster_name() - timestamp = time.time() - test = Test( - 'SCP_cli_logs', - [ - f'sky launch -y -c {name} {SCP_TYPE} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Job Queue. ---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue instead -@pytest.mark.no_scp # SCP does not have T4 gpus. 
Run test_scp_job_queue instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. -@pytest.mark.no_oci # OCI does not have T4 gpus -def test_job_queue(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'job_queue', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 -d examples/job_queue/job.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Job Queue with Docker. ---------- -@pytest.mark.no_fluidstack # FluidStack does not support docker for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_paperspace # Paperspace doesn't have T4 GPUs -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. 
- 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - # Axolotl image is a good example custom image that has its conda path - # set in PATH with dockerfile and uses python>=3.12. It could test: - # 1. we handle the env var set in dockerfile correctly - # 2. python>=3.12 works with SkyPilot runtime. - 'docker:winglian/axolotl:main-latest' - ]) -def test_job_queue_with_docker(generic_cloud: str, image_id: str): - name = _get_cluster_name() + image_id[len('docker:'):][:4] - total_timeout_minutes = 40 if generic_cloud == 'azure' else 15 - time_to_sleep = 300 if generic_cloud == 'azure' else 180 - test = Test( - 'job_queue_with_docker', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', - f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - # Make sure the GPU is still visible to the container. 
- f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 4 --status', - f'sky stop -y {name}', - # Make sure the job status preserve after stop and start the - # cluster. This is also a test for the docker container to be - # preserved after stop and start. - f'sky start -y {name}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - # Make sure it is still visible after an stop & start cycle. - f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 7 --status' - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_job_queue(): - name = _get_cluster_name() - test = Test( - 'lambda_job_queue', - [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_job_queue(): - name = _get_cluster_name() - test = Test( - 
'ibm_job_queue', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100', - f'sky exec {name} -n {name}-1 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-2 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-3 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_job_queue(): - name = _get_cluster_name() - num_of_gpu_launch = 1 - num_of_gpu_exec = 0.5 - test = Test( - 'SCP_job_queue', - [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue_multinode instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus. 
-@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet -def test_job_queue_multinode(generic_cloud: str): - name = _get_cluster_name() - total_timeout_minutes = 30 if generic_cloud == 'azure' else 15 - test = Test( - 'job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml', - f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - 'sleep 90', - f'sky cancel -y {name} 1', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- 'sleep 5', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_large_job_queue(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', - f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', - 'sleep 90', - - # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. - # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. 
- f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', - # Make sure the jobs are scheduled in FIFO order - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(1, 17) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in range(17, 33) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep PENDING' - for i in range(33, 75) - ], - f'sky cancel -y {name} 33 35 37 39 17 18 19', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(33, 40, 2) - ], - 'sleep 10', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in [34, 36, 38] - ], - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_fast_large_job_queue(generic_cloud: str): - # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. 
- name = _get_cluster_name() - test = Test( - 'fast_large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 32`; do sky exec {name} -n {name}-$i -d "echo $i"; done', - 'sleep 60', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep SUCCEEDED | wc -l | grep 32', - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_job_queue_multinode(): - name = _get_cluster_name() - task_file = 'examples/job_queue/job_multinode_ibm.yaml' - test = Test( - 'ibm_job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2', - f'sky exec {name} -n {name}-1 -d {task_file}', - f'sky exec {name} -n {name}-2 -d {task_file}', - f'sky launch -y -c {name} -n {name}-3 --detach-setup -d {task_file}', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep SETTING_UP)', - 'sleep 90', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - f'sky cancel -y {name} 1', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d {task_file}', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus v100:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -# ---------- Docker with preinstalled package. ---------- -@pytest.mark.no_fluidstack # Doesn't support Fluidstack for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -# TODO(zhwu): we should fix this for kubernetes -def test_docker_preinstalled_package(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'docker_with_preinstalled_package', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx', - f'sky exec {name} "nginx -V"', - f'sky logs {name} 1 --status', - f'sky exec {name} whoami | grep root', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Submitting multiple tasks to the same cluster. 
---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus -def test_multi_echo(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'multi_echo', - [ - f'python examples/multi_echo.py {name} {generic_cloud}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 10', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - # Make sure that our job scheduler is fast enough to have at least - # 10 RUNNING jobs in parallel. - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "RUNNING" | wc -l | awk \'{{if ($1 < 10) exit 1}}\'', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done', - ] + - # Ensure jobs succeeded. - [ - _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( - cluster_name=name, - job_id=i + 1, - job_status=JobStatus.SUCCEEDED.value, - timeout=120) for i in range(32) - ] + - # Ensure monitor/autoscaler didn't crash on the 'assert not - # unfulfilled' error. If process not found, grep->ssh returns 1. - [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Task: 1 node training. 
---------- -@pytest.mark.no_fluidstack # Fluidstack does not have T4 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. -def test_huggingface(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_huggingface(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'lambda_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_huggingface(generic_cloud: str): - name = _get_cluster_name() - num_of_gpu_launch = 1 - test = Test( - 'SCP_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Inferentia. 
---------- -@pytest.mark.aws -def test_inferentia(): - name = _get_cluster_name() - test = Test( - 'test_inferentia', - [ - f'sky launch -y -c {name} -t inf2.xlarge -- echo hi', - f'sky exec {name} --gpus Inferentia:1 echo hi', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- TPU. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu(): - name = _get_cluster_name() - test = Test( - 'tpu_app', - [ - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take >20 mins - ) - run_one_test(test) - - -# ---------- TPU VM. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm(): - name = _get_cluster_name() - test = Test( - 'tpu_vm_app', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', # Ensure the cluster is STOPPED. - # Use retry: guard against transient errors observed for - # just-stopped TPU VMs (#962). - f'sky start --retry-until-up -y {name}', - f'sky exec {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - run_one_test(test) - - -# ---------- TPU VM Pod. 
---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm_pod(): - name = _get_cluster_name() - test = Test( - 'tpu_pod', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - run_one_test(test) - - -# ---------- TPU Pod Slice on GKE. ---------- -@pytest.mark.kubernetes -def test_tpu_pod_slice_gke(): - name = _get_cluster_name() - test = Test( - 'tpu_pod_slice_gke', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. - f'sky logs {name} 2 --status' - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - run_one_test(test) - - -# ---------- Simple apps. ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_hostname(generic_cloud: str): - name = _get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( - 'multi_hostname', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 1 | grep "My hostname:" | wc -l | grep 2', # Ensure there are 2 hosts. - f'sky exec {name} examples/multi_hostname.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=_get_timeout(generic_cloud, total_timeout_minutes * 60), - ) - run_one_test(test) - - -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_node_failure(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'multi_node_failure', - [ - # TODO(zhwu): we use multi-thread to run the commands in setup - # commands in parallel, which makes it impossible to fail fast - # when one of the nodes fails. We should fix this in the future. - # The --detach-setup version can fail fast, as the setup is - # submitted to the remote machine, which does not use multi-thread. - # Refer to the comment in `subprocess_utils.run_in_parallel`. - # f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/failed_worker_setup.yaml && exit 1', # Ensure the job setup failed. - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup tests/test_yamls/failed_worker_setup.yaml', - f'sky logs {name} 1 --status | grep FAILED_SETUP', # Ensure the job setup failed. - f'sky exec {name} tests/test_yamls/failed_worker_run.yaml', - f'sky logs {name} 2 --status | grep FAILED', # Ensure the job failed. - f'sky logs {name} 2 | grep "My hostname:" | wc -l | grep 2', # Ensure there 2 of the hosts printed their hostname. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on GCP. ---------- -@pytest.mark.gcp -def test_gcp_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'gcp_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on AWS. ---------- -@pytest.mark.aws -def test_aws_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'aws_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on Azure. ---------- -@pytest.mark.azure -def test_azure_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'azure_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on Kubernetes. ---------- -@pytest.mark.kubernetes -def test_kubernetes_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'kubernetes_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 100); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 5; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on Paperspace. ---------- -@pytest.mark.paperspace -def test_paperspace_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'paperspace_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on RunPod. ---------- -@pytest.mark.runpod -def test_runpod_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'runpod_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Labels from task on AWS (instance_tags) ---------- -@pytest.mark.aws -def test_task_labels_aws(): - name = _get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='aws', region='us-east-1') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = Test( - 'task_labels_aws', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with aws cli that the tags are set. - 'aws ec2 describe-instances ' - '--query "Reservations[*].Instances[*].InstanceId" ' - '--filters "Name=instance-state-name,Values=running" ' - f'--filters "Name=tag:skypilot-cluster-name,Values={name}*" ' - '--filters "Name=tag:inlinelabel1,Values=inlinevalue1" ' - '--filters "Name=tag:inlinelabel2,Values=inlinevalue2" ' - '--region us-east-1 --output text', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Labels from task on GCP (labels) ---------- -@pytest.mark.gcp -def test_task_labels_gcp(): - name = _get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='gcp') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = Test( - 'task_labels_gcp', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with gcloud cli that the tags are set - f'gcloud compute instances list --filter="name~\'^{name}\' AND ' - 'labels.inlinelabel1=\'inlinevalue1\' AND ' - 'labels.inlinelabel2=\'inlinevalue2\'" ' - '--format="value(name)" | grep .', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Labels from task on 
Kubernetes (labels) ---------- -@pytest.mark.kubernetes -def test_task_labels_kubernetes(): - name = _get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='kubernetes') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = Test( - 'task_labels_kubernetes', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with kubectl that the labels are set. - 'kubectl get pods ' - '--selector inlinelabel1=inlinevalue1 ' - '--selector inlinelabel2=inlinevalue2 ' - '-o jsonpath=\'{.items[*].metadata.name}\' | ' - f'grep \'^{name}\'' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Pod Annotations on Kubernetes ---------- -@pytest.mark.kubernetes -def test_add_pod_annotations_for_autodown_with_launch(): - name = _get_cluster_name() - test = Test( - 'add_pod_annotations_for_autodown_with_launch', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - # Autodown is set. - f'sky launch -y -c {name} -i 10 --down --num-nodes 2 --cpus=1 --cloud kubernetes', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. 
- 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_add_and_remove_pod_annotations_with_autostop(): - name = _get_cluster_name() - test = Test( - 'add_and_remove_pod_annotations_with_autostop', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - f'sky launch -y -c {name} --num-nodes 2 --cpus=1 --cloud kubernetes', - # Set autodown on the cluster with 'autostop' command. - f'sky autostop -y {name} -i 20 --down', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. - 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - # Cancel the set autodown to remove the annotations from the pods. - f'sky autostop -y {name} --cancel', - # Describe the first pod and check if annotations are removed. - '! kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - '! kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check if annotations are removed. - '! kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - '! 
kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Container logs from task on Kubernetes ---------- -@pytest.mark.kubernetes -def test_container_logs_multinode_kubernetes(): - name = _get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - head_logs = ('kubectl get pods ' - f' | grep {name} | grep head | ' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - worker_logs = ('kubectl get pods ' - f' | grep {name} | grep worker |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( - 'container_logs_multinode_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml} --num-nodes 2', - f'{head_logs} | wc -l | grep 9', - f'{worker_logs} | wc -l | grep 9', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_container_logs_two_jobs_kubernetes(): - name = _get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( - 'test_container_logs_two_jobs_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 9', - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def 
test_container_logs_two_simultaneous_jobs_kubernetes(): - name = _get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( - 'test_container_logs_two_simultaneous_jobs_kubernetes', - [ - f'sky launch -y -c {name}', - f'sky exec -c {name} -d {task_yaml}', - f'sky exec -c {name} -d {task_yaml}', - 'sleep 30', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Task: n=2 nodes with setups. ---------- -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.skip( - reason= - 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') -def test_distributed_tf(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'resnet_distributed_tf_app', - [ - # NOTE: running it twice will hang (sometimes?) - an app-level bug. - f'python examples/resnet_distributed_tf_app.py {name} {generic_cloud}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=25 * 60, # 25 mins (it takes around ~19 mins) - ) - run_one_test(test) - - -# ---------- Testing GCP start and stop instances ---------- -@pytest.mark.gcp -def test_gcp_start_stop(): - name = _get_cluster_name() - test = Test( - 'gcp-start-stop', - [ - f'sky launch -y -c {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. - f'sky logs {name} 3 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=40), - f'sky start -y {name} -i 1', - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', - timeout=200), - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing Azure start and stop instances ---------- -@pytest.mark.azure -def test_azure_start_stop(): - name = _get_cluster_name() - test = Test( - 'azure-start-stop', - [ - f'sky launch -y -c {name} examples/azure_start_stop.yaml', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', - timeout=280) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins - ) - run_one_test(test) - - -# ---------- Testing Autostopping ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_ibm # FIX(IBM) sporadically fails, as restarted workers stay uninitialized indefinitely -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_kubernetes # Kubernetes does not autostop yet -def test_autostop(generic_cloud: str): - name = _get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. - autostop_timeout = 600 if generic_cloud == 'azure' else 250 - # Launching and starting Azure clusters can take a long time too. e.g., restart - # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. - total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 - test = Test( - 'autostop', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} -i 1', - - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m"', - - # Ensure the cluster is not stopped early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - - # Ensure the cluster is STOPPED. 
- _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), - - # Ensure the cluster is UP and the autostop setting is reset ('-'). - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - - # Ensure the job succeeded. - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - - # Test restarting the idleness timer via reset: - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 40', # Almost reached the threshold. - f'sky autostop -y {name} -i 1', # Should restart the timer. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), - - # Test restarting the idleness timer via exec: - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 45', # Almost reached the threshold. - f'sky exec {name} echo hi', # Should restart the timer. - 'sleep 45', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout + _BUMP_UP_SECONDS), - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -# ---------- Testing Autodowning ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. -def test_autodown(generic_cloud: str): - name = _get_cluster_name() - # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure - # the VM is terminated. 
- autodown_timeout = 900 if generic_cloud == 'azure' else 240 - total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 - test = Test( - 'autodown', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - # Ensure the cluster is terminated. - f'sleep {autodown_timeout}', - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - f'sleep {autodown_timeout}', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - f'sleep {autodown_timeout}', - # Ensure the cluster is still UP. 
- f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_autodown(): - name = _get_cluster_name() - test = Test( - 'SCP_autodown', - [ - f'sky launch -y -d -c {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 45', - f'sky status --refresh | grep {name} | grep UP', - # Ensure the cluster is terminated. - 'sleep 200', - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - 'sleep 200', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - 'sleep 200', - # Ensure the cluster is still UP. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): - test = Test( - f'{cloud}-cancel-task', - [ - f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', - # Wait the GPU process to start. 
- 'sleep 60', - f'sky exec {name} "nvidia-smi | grep python"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky cancel -y {name} 1', - 'sleep 60', - # check if the python job is gone. - f'sky exec {name} "! nvidia-smi | grep python"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=timeout, - ) - return test - - -# ---------- Testing `sky cancel` ---------- -@pytest.mark.aws -def test_cancel_aws(): - name = _get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'aws') - run_one_test(test) - - -@pytest.mark.gcp -def test_cancel_gcp(): - name = _get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'gcp') - run_one_test(test) - - -@pytest.mark.azure -def test_cancel_azure(): - name = _get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_cancel_pytorch(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'cancel-pytorch', - [ - f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d', - # Wait the GPU process to start. - 'sleep 90', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || ' - # When run inside container/k8s, nvidia-smi cannot show process ids. - # See https://github.com/NVIDIA/nvidia-docker/issues/179 - # To work around, we check if GPU utilization is greater than 0. - f'[ \$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits) -gt 0 ]"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- f'sky cancel -y {name} 1', - 'sleep 60', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep \'No running process\') || ' - # Ensure Xorg is the only process running. - '[ \$(nvidia-smi | grep -A 10 Processes | grep -A 10 === | grep -v Xorg) -eq 2 ]"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -# can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` -# requires a CUDA public image, which IBM doesn't offer -@pytest.mark.ibm -def test_cancel_ibm(): - name = _get_cluster_name() - test = Test( - 'ibm-cancel-task', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', - f'sky exec {name} -n {name}-1 -d "while true; do echo \'Hello SkyPilot\'; sleep 2; done"', - 'sleep 20', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky cancel -y {name} 2', - f'sleep 5', - f'sky queue {name} | grep {name}-1 | grep CANCELLED', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing use-spot option ---------- -@pytest.mark.no_fluidstack # FluidStack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -def test_use_spot(generic_cloud: str): - """Test use-spot and sky exec.""" - name = _get_cluster_name() - test = Test( - 'use-spot', - [ - f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', - f'sky logs {name} 1 --status', - f'sky exec {name} echo hi', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_stop_gcp_spot(): - """Test GCP spot can be stopped, autostopped, restarted.""" - name = 
_get_cluster_name() - test = Test( - 'stop_gcp_spot', - [ - f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', - # stop should go through: - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 2 --status', - f'sky autostop {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=90), - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 3 --status', - # -i option at launch should go through: - f'sky launch -c {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=120), - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing managed job ---------- -# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues -# when the controller being on Azure, which takes a long time for launching -# step. 
-@pytest.mark.managed_jobs -def test_managed_jobs(generic_cloud: str): - """Test the managed jobs yaml.""" - name = _get_cluster_name() - test = Test( - 'managed-jobs', - [ - f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-1', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', - timeout=60), - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', - timeout=60), - f'sky jobs cancel -y -n {name}-1', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-1', - job_status=f'{ManagedJobStatus.CANCELLED.value}', - timeout=230), - # Test the functionality for logging. - f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', - f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', - ], - # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when - # canceling multiple job names is supported. - f'sky jobs cancel -y -n {name}-1; sky jobs cancel -y -n {name}-2', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_job_pipeline(generic_cloud: str): - """Test a job pipeline.""" - name = _get_cluster_name() - test = Test( - 'spot-pipeline', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', - # `grep -A 4 {name}` finds the job with {name} and the 4 lines - # after it, i.e. the 4 tasks within the job. - # `sed -n 2p` gets the second line of the 4 lines, i.e. the first - # task within the job. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', - f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_failed_setup(generic_cloud: str): - """Test managed job with failed setup.""" - name = _get_cluster_name() - test = Test( - 'managed_jobs_failed_setup', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - # Make sure the job failed quickly. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', - timeout=330 + _BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): - """Test managed job with failed setup for a pipeline.""" - name = _get_cluster_name() - test = Test( - 'managed_jobs_pipeline_failed_setup', - [ - f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', - timeout=600), - # Make sure the job failed quickly. - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', - # Task 0 should be SUCCEEDED. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', - # Task 1 should be FAILED_SETUP. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', - # Task 2 should be CANCELLED. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - # Task 3 should be CANCELLED. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=30 * 60, - ) - run_one_test(test) - - -# ---------- Testing managed job recovery ---------- - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_aws(aws_config_region): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = Test( - 'managed_jobs_recovery_aws', - [ - f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=600), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_gcp(): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-east4-b' - query_cmd = ( - f'gcloud compute 
instances list --filter=' - # `:` means prefix match. - f'"(labels.ray-cluster-name:{name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_recovery_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=300), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - terminate_cmd, - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_aws(aws_config_region): - """Test managed job recovery for a pipeline.""" - name = _get_cluster_name() - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - region = aws_config_region - if region != 'us-east-2': - pytest.skip('Only run spot pipeline recovery test in us-east-2') - test = Test( - 'managed_jobs_pipeline_recovery_aws', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} 
--no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', - # Terminate the cluster manually. - # The `cat ...| rev` is to retrieve the job_id from the - # SKYPILOT_TASK_ID, which gets the second to last field - # separated by `-`. - ( - f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - 'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;' - f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - # TODO(zhwu): fix the name for spot cluster. - '--filters Name=tag:ray-cluster-name,Values=*-${MANAGED_JOB_ID}' - f'-{user_hash} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', - f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', - f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_gcp(): - """Test managed job recovery for a pipeline.""" - name = _get_cluster_name() - zone = 'us-east4-b' - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - query_cmd = ( - 'gcloud compute instances list --filter=' - 
f'"(labels.ray-cluster-name:*-${{MANAGED_JOB_ID}}-{user_hash})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_pipeline_recovery_gcp', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', - # Terminate the cluster manually. - # The `cat ...| rev` is to retrieve the job_id from the - # SKYPILOT_TASK_ID, which gets the second to last field - # separated by `-`. - (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', - f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', - f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does 
not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_default_resources(generic_cloud: str): - """Test managed job recovery for default resources.""" - name = _get_cluster_name() - test = Test( - 'managed-spot-recovery-default-resources', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', - timeout=360), - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_aws(aws_config_region): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = Test( - 'managed_jobs_recovery_multi_node_aws', - [ - f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=450), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. 
- (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - 'Name=tag:ray-node-type,Values=worker ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_gcp(): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west2-a' - # Use ':' to match as the cluster name will contain the suffix with job id - query_cmd = ( - f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_on_cloud} AND ' - f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_recovery_multi_node_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. 
- terminate_cmd, - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_aws(aws_config_region): - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_2_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-2', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = Test( - 'managed_jobs_cancellation_aws', - [ - # Test cancellation during spot cluster being launched. 
- f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status= - f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', - timeout=60 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - # The job is running in the cluster, will shown as RUNNING. 
- _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - # Terminate the cluster manually. - (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "pending|running|stopped|stopping"' - ), - ], - timeout=25 * 60) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_gcp(): - name = _get_cluster_name() - name_3 = f'{name}-3' - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west3-b' - query_state_cmd = ( - 'gcloud compute instances list ' - f'--filter="(labels.ray-cluster-name:{name_3_on_cloud})" ' - '--format="value(status)"') - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_3_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete 
--zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_cancellation_gcp', - [ - # Test cancellation during spot cluster being launched. - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.STARTING.value, - timeout=60 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - # Terminate the cluster manually. 
- terminate_cmd, - _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' - ), - ], - timeout=25 * 60) - run_one_test(test) - - -# ---------- Testing storage for managed job ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_storage(generic_cloud: str): - """Test storage with managed job""" - name = _get_cluster_name() - yaml_str = pathlib.Path( - 'examples/managed_job_with_storage.yaml').read_text() - timestamp = int(time.time()) - storage_name = f'sky-test-{timestamp}' - output_storage_name = f'sky-test-output-{timestamp}' - - # Also perform region testing for bucket creation to validate if buckets are - # created in the correct region and correctly mounted in managed jobs. - # However, we inject this testing only for AWS and GCP since they are the - # supported object storage providers in SkyPilot. 
- region_flag = '' - region_validation_cmd = 'true' - use_spot = ' --use-spot' - if generic_cloud == 'aws': - region = 'eu-central-1' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.S3, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - output_check_cmd = f'{s3_check_file_count} | grep 1' - elif generic_cloud == 'gcp': - region = 'us-west2' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.GCS, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - output_check_cmd = f'{gcs_check_file_count} | grep 1' - elif generic_cloud == 'azure': - region = 'westus2' - region_flag = f' --region {region}' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name(region)) - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.AZURE, - storage_account_name=storage_account_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - az_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.AZURE, - output_storage_name, - 'output.txt', - storage_account_name=storage_account_name) - output_check_cmd = f'{az_check_file_count} | grep 1' - elif generic_cloud == 'kubernetes': - # With Kubernetes, we don't know which object storage provider is used. - # Check both S3 and GCS if bucket exists in either. 
- s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - s3_output_check_cmd = f'{s3_check_file_count} | grep 1' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - gcs_output_check_cmd = f'{gcs_check_file_count} | grep 1' - output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}' - use_spot = ' --no-use-spot' - - yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) - yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(yaml_str) - f.flush() - file_path = f.name - test = Test( - 'managed_jobs_storage', - [ - *STORAGE_SETUP_COMMANDS, - f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', - region_validation_cmd, # Check if the bucket is created in the correct region - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. - format(job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, - timeout=60 + _BUMP_UP_SECONDS), - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', - # Check if file was written to the mounted output bucket - output_check_cmd - ], - (f'sky jobs cancel -y -n {name}', - f'; sky storage delete {output_storage_name} || true'), - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Testing spot TPU ---------- -@pytest.mark.gcp -@pytest.mark.managed_jobs -@pytest.mark.tpu -def test_managed_jobs_tpu(): - """Test managed job on TPU.""" - name = _get_cluster_name() - test = Test( - 'test-spot-tpu', - [ - f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.STARTING.value, - timeout=60 + _BUMP_UP_SECONDS), - # TPU takes a while to launch - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', - timeout=900 + _BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Testing env for managed jobs ---------- -@pytest.mark.managed_jobs -def test_managed_jobs_inline_env(generic_cloud: str): - """Test managed jobs env""" - name = _get_cluster_name() - test = Test( - 'test-managed-jobs-inline-env', - [ - f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, - timeout=20 + _BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Testing env ---------- -def test_inline_env(generic_cloud: str): - """Test env""" - name = _get_cluster_name() - test = Test( - 'test-inline-env', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'sky logs {name} 1 --status', - f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - _get_timeout(generic_cloud), - ) - run_one_test(test) - - -# ---------- Testing env file ---------- -def test_inline_env_file(generic_cloud: str): - """Test env""" - name = _get_cluster_name() - test = Test( - 'test-inline-env-file', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 1 --status', - f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - _get_timeout(generic_cloud), - ) - run_one_test(test) - - -# ---------- Testing custom image ---------- -@pytest.mark.aws -def test_aws_custom_image(): - """Test AWS custom image""" - name = _get_cluster_name() - test = Test( - 'test-aws-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image - f'sky logs {name} 1 --status', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. - 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. 
- 'docker:continuumio/miniconda3:latest', - ]) -def test_kubernetes_custom_image(image_id): - """Test Kubernetes custom image""" - name = _get_cluster_name() - test = Test( - 'test-kubernetes-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', - f'sky logs {name} 1 --status', - # Try exec to run again and check if the logs are printed - f'sky exec {name} tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1 | grep "Hello 100"', - # Make sure ssh is working with custom username - f'ssh {name} echo hi | grep hi', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_start_stop_two_nodes(): - name = _get_cluster_name() - test = Test( - 'azure-start-stop-two-nodes', - [ - f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status= - f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', - timeout=200 + _BUMP_UP_SECONDS) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins (it takes around ~23 mins) - ) - run_one_test(test) - - -# ---------- Testing env for disk tier ---------- -@pytest.mark.aws -def test_aws_disk_tier(): - - def _get_aws_query_command(region, instance_id, field, expected): - return (f'aws ec2 describe-volumes --region {region} ' - f'--filters Name=attachment.instance-id,Values={instance_id} ' - f'--query Volumes[*].{field} | grep {expected} ; ') - - for disk_tier in list(resources_utils.DiskTier): - specs = AWS._get_disk_specs(disk_tier) - name = _get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = Test( - 'aws-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud aws --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} --query ' - f'Reservations[].Instances[].InstanceId --output text`; ' + - _get_aws_query_command(region, '$id', 'VolumeType', - specs['disk_tier']) + - ('' if specs['disk_tier'] - == 'standard' else _get_aws_query_command( - region, '$id', 'Iops', specs['disk_iops'])) + - ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( - region, '$id', 'Throughput', specs['disk_throughput'])), - ], - f'sky down -y {name}', - timeout=10 * 60, # 10 mins (it takes around ~6 mins) - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - disk_types = [GCP._get_disk_type(disk_tier)] - name = _get_cluster_name() + '-' + disk_tier.value - name_on_cloud = 
common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - region = 'us-west2' - instance_type_options = [''] - if disk_tier == resources_utils.DiskTier.BEST: - # Ultra disk tier requires n2 instance types to have more than 64 CPUs. - # If using default instance type, it will only enable the high disk tier. - disk_types = [ - GCP._get_disk_type(resources_utils.DiskTier.HIGH), - GCP._get_disk_type(resources_utils.DiskTier.ULTRA), - ] - instance_type_options = ['', '--instance-type n2-standard-64'] - for disk_type, instance_type_option in zip(disk_types, - instance_type_options): - test = Test( - 'gcp-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud gcp --region {region} ' - f'--disk-tier {disk_tier.value} {instance_type_option} ', - f'name=`gcloud compute instances list --filter=' - f'"labels.ray-cluster-name:{name_on_cloud}" ' - '--format="value(name)"`; ' - f'gcloud compute disks list --filter="name=$name" ' - f'--format="value(type)" | grep {disk_type} ' - ], - f'sky down -y {name}', - timeout=6 * 60, # 6 mins (it takes around ~3 mins) - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: - # Azure does not support high and ultra disk tier. 
- continue - type = Azure._get_disk_type(disk_tier) - name = _get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = Test( - 'azure-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}' - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_best_tier_failover(): - type = Azure._get_disk_type(resources_utils.DiskTier.LOW) - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = Test( - 'azure-best-tier-failover', - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier best --instance-type Standard_D8_v5 echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - run_one_test(test) - - -# ------ Testing Zero Quota Failover ------ -@pytest.mark.aws -def test_aws_zero_quota_failover(): - - name = _get_cluster_name() - region = get_aws_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for EC2 P3 instances were found on all AWS regions. 
Is this ' - 'expected for your account?') - return - - test = Test( - 'aws-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zero_quota_failover(): - - name = _get_cluster_name() - region = get_gcp_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for A100-80GB GPUs were found on all GCP regions. Is this ' - 'expected for your account?') - return - - test = Test( - 'gcp-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -def test_long_setup_run_script(generic_cloud: str): - name = _get_cluster_name() - with tempfile.NamedTemporaryFile('w', prefix='sky_app_', - suffix='.yaml') as f: - f.write( - textwrap.dedent(""" \ - setup: | - echo "start long setup" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end long setup"\n') - f.write( - textwrap.dedent(""" \ - run: | - echo "run" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end run"\n') - f.flush() - - test = Test( - 'long-setup-run-script', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', - f'sky exec {name} "echo hello"', - f'sky exec {name} {f.name}', - f'sky logs {name} --status 1', - f'sky logs {name} --status 2', - f'sky logs {name} --status 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing skyserve ---------- - - -def _get_service_name() -> str: - """Returns a user-unique service name for each test_skyserve_(). - - Must be called from each test_skyserve_(). 
- """ - caller_func_name = inspect.stack()[1][3] - test_name = caller_func_name.replace('_', '-').replace('test-', 't-') - test_name = test_name.replace('skyserve-', 'ss-') - test_name = common_utils.make_cluster_name_on_cloud(test_name, 24) - return f'{test_name}-{test_id}' - - -# We check the output of the skyserve service to see if it is ready. Output of -# `REPLICAS` is in the form of `1/2` where the first number is the number of -# ready replicas and the second number is the number of total replicas. We -# grep such format to ensure that the service is ready, and early exit if any -# failure detected. In the end we sleep for -# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have -# enough time to sync with the controller and get all ready replica IPs. -_SERVE_WAIT_UNTIL_READY = ( - '{{ while true; do' - ' s=$(sky serve status {name}); echo "$s";' - ' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;' - ' echo "$s" | grep -q "FAILED" && exit 1;' - ' sleep 10;' - ' done; }}; echo "Got service status $s";' - f'sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS + 2};') -_IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' -_AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' -_SERVICE_LAUNCHING_STATUS_REGEX = 'PROVISIONING\|STARTING' -# Since we don't allow terminate the service if the controller is INIT, -# which is common for simultaneous pytest, we need to wait until the -# controller is UP before we can terminate the service. -# The teardown command has a 10-mins timeout, so we don't need to do -# the timeout here. See implementation of run_one_test() for details. 
-_TEARDOWN_SERVICE = ( - '(for i in `seq 1 20`; do' - ' s=$(sky serve down -y {name});' - ' echo "Trying to terminate {name}";' - ' echo "$s";' - ' echo "$s" | grep -q "scheduled to be terminated\|No service to terminate" && break;' - ' sleep 10;' - ' [ $i -eq 20 ] && echo "Failed to terminate service {name}";' - 'done)') - -_SERVE_ENDPOINT_WAIT = ( - 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' - 'endpoint=$(sky serve status --endpoint {name}); ' - 'until ! echo "$endpoint" | grep "Controller is initializing"; ' - 'do echo "Waiting for serve endpoint to be ready..."; ' - 'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; ' - 'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"') - -_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); ' - 'until ! echo "$s" | grep "Controller is initializing."; ' - 'do echo "Waiting for serve status to be ready..."; ' - 'sleep 5; s=$(sky serve status {name}); done; echo "$s"') - - -def _get_replica_ip(name: str, replica_id: int) -> str: - return (f'ip{replica_id}=$(echo "$s" | ' - f'awk "{_AWK_ALL_LINES_BELOW_REPLICAS}" | ' - f'grep -E "{name}\s+{replica_id}" | ' - f'grep -Eo "{_IP_REGEX}")') - - -def _get_skyserve_http_test(name: str, cloud: str, - timeout_minutes: int) -> Test: - test = Test( - f'test-skyserve-{cloud.replace("_", "-")}', - [ - f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=timeout_minutes * 60, - ) - return test - - -def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool, - str]]) -> str: - """Check replicas' status and count in sky serve status - - We will check vCPU=2, as all our tests use vCPU=2. - - Args: - name: the name of the service - check_tuples: A list of replica property to check. 
Each tuple is - (count, is_spot, status) - """ - check_cmd = '' - for check_tuple in check_tuples: - count, is_spot, status = check_tuple - resource_str = '' - if status not in ['PENDING', 'SHUTTING_DOWN' - ] and not status.startswith('FAILED'): - spot_str = '' - if is_spot: - spot_str = '\[Spot\]' - resource_str = f'({spot_str}vCPU=2)' - check_cmd += (f' echo "$s" | grep "{resource_str}" | ' - f'grep "{status}" | wc -l | grep {count} || exit 1;') - return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd) - - -def _check_service_version(service_name: str, version: str) -> str: - # Grep the lines before 'Service Replicas' and check if the service version - # is correct. - return (f'echo "$s" | grep -B1000 "Service Replicas" | ' - f'grep -E "{service_name}\s+{version}" || exit 1; ') - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_gcp_http(): - """Test skyserve on GCP""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'gcp', 20) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.serve -def test_skyserve_aws_http(): - """Test skyserve on AWS""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'aws', 20) - run_one_test(test) - - -@pytest.mark.azure -@pytest.mark.serve -def test_skyserve_azure_http(): - """Test skyserve on Azure""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'azure', 30) - run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.serve -def test_skyserve_kubernetes_http(): - """Test skyserve on Kubernetes""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'kubernetes', 30) - run_one_test(test) - - -@pytest.mark.oci -@pytest.mark.serve -def test_skyserve_oci_http(): - """Test skyserve on OCI""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'oci', 20) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now -@pytest.mark.serve -def test_skyserve_llm(generic_cloud: str): 
- """Test skyserve with real LLM usecase""" - name = _get_service_name() - - def generate_llm_test_command(prompt: str, expected_output: str) -> str: - prompt = shlex.quote(prompt) - expected_output = shlex.quote(expected_output) - return ( - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python tests/skyserve/llm/get_response.py --endpoint $endpoint ' - f'--prompt {prompt} | grep {expected_output}') - - with open('tests/skyserve/llm/prompt_output.json', 'r', - encoding='utf-8') as f: - prompt2output = json.load(f) - - test = Test( - f'test-skyserve-llm', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - *[ - generate_llm_test_command(prompt, output) - for prompt, output in prompt2output.items() - ], - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=40 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_spot_recovery(): - name = _get_service_name() - zone = 'us-central1-a' - - test = Test( - f'test-skyserve-spot-recovery-gcp', - [ - f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - _terminate_gcp_replica(name, zone, 1), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_base_ondemand_fallback(generic_cloud: str): - name = _get_service_name() - test = Test( - 
f'test-skyserve-base-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(1, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_dynamic_ondemand_fallback(): - name = _get_service_name() - zone = 'us-central1-a' - - test = Test( - f'test-skyserve-dynamic-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud gcp -y tests/skyserve/spot/dynamic_ondemand_fallback.yaml', - f'sleep 40', - # 2 on-demand (provisioning) + 2 Spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s";' - 'echo "$s" | grep -q "0/4" || exit 1', - # Wait for the provisioning starts - f'sleep 40', - _check_replica_in_status(name, [ - (2, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (2, False, _SERVICE_LAUNCHING_STATUS_REGEX + '\|SHUTTING_DOWN') - ]), - - # Wait until 2 spot instances are ready. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - _terminate_gcp_replica(name, zone, 1), - f'sleep 40', - # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - 'echo "$s" | grep -q "1/3"', - _check_replica_in_status( - name, [(1, True, 'READY'), - (1, True, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - - # Wait until 2 spot instances are ready. 
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_user_bug_restart(generic_cloud: str): - """Tests that we restart the service after user bug.""" - # TODO(zhwu): this behavior needs some rethinking. - name = _get_service_name() - test = Test( - f'test-skyserve-user-bug-restart', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "SHUTTING_DOWN"; ' - 'do echo "Waiting for first service to be SHUTTING DOWN..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; ', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "FAILED"; ' - 'do echo "Waiting for first service to be FAILED..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; echo "$s"; ' - + _check_replica_in_status(name, [(1, True, 'FAILED')]) + - # User bug failure will cause no further scaling. 
- f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 1; ' - f'echo "$s" | grep -B 100 "NO_REPLICA" | grep "0/0"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/auto_restart.yaml', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, SkyPilot here!"; do sleep 2; done; sleep 2; ' - + _check_replica_in_status(name, [(1, False, 'READY'), - (1, False, 'FAILED')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -@pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP -def test_skyserve_load_balancer(generic_cloud: str): - """Test skyserve load balancer round-robin policy""" - name = _get_service_name() - test = Test( - f'test-skyserve-load-balancer', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - f'{_get_replica_ip(name, 1)}; ' - f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' - 'python tests/skyserve/load_balancer/test_round_robin.py ' - '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_auto_restart(): - """Test skyserve with auto restart""" - name = _get_service_name() - zone = 'us-central1-a' - test = Test( - f'test-skyserve-auto-restart', - [ - # TODO(tian): we can dynamically generate YAML from template to - # avoid maintaining too many YAML files - f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl 
http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - # sleep for 20 seconds (initial delay) to make sure it will - # be restarted - f'sleep 20', - _terminate_gcp_replica(name, zone, 1), - # Wait for consecutive failure timeout passed. - # If the cluster is not using spot, it won't check the cluster status - # on the cloud (since manual shutdown is not a common behavior and such - # queries takes a lot of time). Instead, we think continuous 3 min probe - # failure is not a temporary problem but indeed a failure. - 'sleep 180', - # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time - # that the output of `sky serve status` shows FAILED and this status will - # cause _SERVE_WAIT_UNTIL_READY to early quit. - '(while true; do' - f' output=$(sky serve status {name});' - ' echo "$output" | grep -q "1/1" && break;' - ' sleep 10;' - f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_cancel(generic_cloud: str): - """Test skyserve with cancel""" - name = _get_service_name() - - test = Test( - f'test-skyserve-cancel', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' - 'tests/skyserve/cancel/send_cancel_request.py ' - '--endpoint $endpoint | grep "Request was cancelled"', - f's=$(sky serve logs {name} 1 --no-follow); ' - 'until ! 
echo "$s" | grep "Please wait for the controller to be"; ' - 'do echo "Waiting for serve logs"; sleep 10; ' - f's=$(sky serve logs {name} 1 --no-follow); done; ' - 'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_streaming(generic_cloud: str): - """Test skyserve with streaming""" - name = _get_service_name() - test = Test( - f'test-skyserve-streaming', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python3 tests/skyserve/streaming/send_streaming_request.py ' - '--endpoint $endpoint | grep "Streaming test passed"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_readiness_timeout_fail(generic_cloud: str): - """Test skyserve with large readiness probe latency, expected to fail""" - name = _get_service_name() - test = Test( - f'test-skyserve-readiness-timeout-fail', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', - # None of the readiness probe will pass, so the service will be - # terminated after the initial delay. 
- f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_large_readiness_timeout(generic_cloud: str): - """Test skyserve with customized large readiness timeout""" - name = _get_service_name() - test = Test( - f'test-skyserve-large-readiness-timeout', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_update(generic_cloud: str): - """Test skyserve with update""" - name = _get_service_name() - test = Test( - f'test-skyserve-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/new.yaml', - # sleep before update is registered. 
- 'sleep 20', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;' - # Make sure the traffic is not mixed - 'curl http://$endpoint | grep "Hi, new SkyPilot here"', - # The latest 2 version should be READY and the older versions should be shutting down - (_check_replica_in_status(name, [(2, False, 'READY'), - (2, False, 'SHUTTING_DOWN')]) + - _check_service_version(name, "2")), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_rolling_update(generic_cloud: str): - """Test skyserve with rolling update""" - name = _get_service_name() - single_new_replica = _check_replica_in_status( - name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN')]) - test = Test( - f'test-skyserve-rolling-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/new.yaml', - # Make sure the traffic is mixed across two versions, the replicas - # with even id will sleep 60 seconds before being ready, so we - # should be able to get observe the period that the traffic is mixed - # across two versions. 
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; ' - # The latest version should have one READY and the one of the older versions should be shutting down - f'{single_new_replica} {_check_service_version(name, "1,2")} ' - # Check the output from the old version, immediately after the - # output from the new version appears. This is guaranteed by the - # round robin load balancing policy. - # TODO(zhwu): we should have a more generalized way for checking the - # mixed version of replicas to avoid depending on the specific - # round robin load balancing policy. - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_fast_update(generic_cloud: str): - """Test skyserve with fast update (Increment version of old replicas)""" - name = _get_service_name() - - test = Test( - f'test-skyserve-fast-update', - [ - f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', - # sleep to wait for update to be registered. - 'sleep 40', - # 2 on-demand (ready) + 1 on-demand (provisioning). - ( - _check_replica_in_status( - name, [(2, False, 'READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) + - # Fast update will directly have the latest version ready. 
- _check_service_version(name, "2")), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - # Test rolling update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', - # sleep to wait for update to be registered. - 'sleep 25', - # 2 on-demand (ready) + 1 on-demand (shutting down). - _check_replica_in_status(name, [(2, False, 'READY'), - (1, False, 'SHUTTING_DOWN')]), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_update_autoscale(generic_cloud: str): - """Test skyserve update with autoscale""" - name = _get_service_name() - test = Test( - f'test-skyserve-update-autoscale', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/num_min_one.yaml', - # sleep before update is registered. - 'sleep 20', - # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - # Rolling Update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - # sleep before update is registered. - 'sleep 20', - # Timeout will be triggered when update fails. 
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Spot instances are not supported by Fluidstack -@pytest.mark.serve -@pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes -@pytest.mark.parametrize('mode', ['rolling', 'blue_green']) -def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): - """Test skyserve with update that changes autoscaler""" - name = f'{_get_service_name()}-{mode}' - - wait_until_no_pending = ( - f's=$(sky serve status {name}); echo "$s"; ' - 'until ! echo "$s" | grep PENDING; do ' - ' echo "Waiting for replica to be out of pending..."; ' - f' sleep 5; s=$(sky serve status {name}); ' - ' echo "$s"; ' - 'done') - four_spot_up_cmd = _check_replica_in_status(name, [(4, True, 'READY')]) - update_check = [f'until ({four_spot_up_cmd}); do sleep 5; done; sleep 15;'] - if mode == 'rolling': - # Check rolling update, it will terminate one of the old on-demand - # instances, once there are 4 spot instance ready. - update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN'), (1, False, 'READY')]) + - _check_service_version(name, "1,2"), - ] - else: - # Check blue green update, it will keep both old on-demand instances - # running, once there are 4 spot instance ready. 
- update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]) + - _check_service_version(name, "1"), - ] - test = Test( - f'test-skyserve-new-autoscaler-update-{mode}', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 's=$(curl http://$endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml', - # Wait for update to be registered - f'sleep 90', - wait_until_no_pending, - _check_replica_in_status( - name, [(4, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]), - *update_check, - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - _check_replica_in_status(name, [(4, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_failures(generic_cloud: str): - """Test replica failure statuses""" - name = _get_service_name() - - test = Test( - 'test-skyserve-failures', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep 
"FAILED_INITIAL_DELAY" | wc -l | grep 2; ' - # Make sure no new replicas are started for early failure. - f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 2;', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/failures/probing.yaml', - f's=$(sky serve status {name}); ' - # Wait for replica to be ready. - f'until echo "$s" | grep "READY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - # Wait for replica to change to FAILED_PROBING - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_PROBING"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done', - # Wait for the PENDING replica to appear. - 'sleep 10', - # Wait until the replica is out of PENDING. - f's=$(sky serve status {name}); ' - f'until ! echo "$s" | grep "PENDING" && ! echo "$s" | grep "Please wait for the controller to be ready."; do ' - 'echo "Waiting for replica to be out of pending..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done; ' + - _check_replica_in_status( - name, [(1, False, 'FAILED_PROBING'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - # TODO(zhwu): add test for FAILED_PROVISION - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO(Ziming, Tian): Add tests for autoscaling. 
- - -# ------- Testing user dependencies -------- -def test_user_dependencies(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'user-dependencies', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', - f'sky logs {name} 1 --status', - f'sky exec {name} "echo hi"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - f'sky exec {name} "echo bye"', - f'sky logs {name} 3 --status', - f'sky launch -c {name} tests/test_yamls/different_default_conda_env.yaml', - f'sky logs {name} 4 --status', - # Launch again to test the default env does not affect SkyPilot - # runtime setup - f'sky launch -c {name} "python --version 2>&1 | grep \'Python 3.6\' || exit 1"', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ------- Testing the core API -------- -# Most of the core APIs have been tested in the CLI tests. -# These tests are for testing the return value of the APIs not fully used in CLI. - - -@pytest.mark.gcp -def test_core_api_sky_launch_exec(): - name = _get_cluster_name() - task = sky.Task(run="whoami") - task.set_resources(sky.Resources(cloud=sky.GCP())) - job_id, handle = sky.launch(task, cluster_name=name) - assert job_id == 1 - assert handle is not None - assert handle.cluster_name == name - assert handle.launched_resources.cloud.is_same_cloud(sky.GCP()) - job_id_exec, handle_exec = sky.exec(task, cluster_name=name) - assert job_id_exec == 2 - assert handle_exec is not None - assert handle_exec.cluster_name == name - assert handle_exec.launched_resources.cloud.is_same_cloud(sky.GCP()) - # For dummy task (i.e. task.run is None), the job won't be submitted. - dummy_task = sky.Task() - job_id_dummy, _ = sky.exec(dummy_task, cluster_name=name) - assert job_id_dummy is None - sky.down(name) - - -# The sky launch CLI has some additional checks to make sure the cluster is up/ -# restarted. 
However, the core API doesn't have these; make sure it still works -def test_core_api_sky_launch_fast(generic_cloud: str): - name = _get_cluster_name() - cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) - try: - task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - # Sleep to let the cluster autostop - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED, - timeout=120) - # Run it again - should work with fast=True - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - finally: - sky.down(name) - - -# ---------- Testing Storage ---------- -class TestStorageWithCredentials: - """Storage tests which require credentials and network connection""" - - AWS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - 'bucket-s3alias', # ends with '-s3alias' suffix - 'bucket--ol-s3', # ends with '--ol-s3' suffix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - ] - - GCS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters (without dots) - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1' - # More than 63 characters between dots - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqfghijklmnopqrstuvw' * 5, - # more than 222 characters (with dots) - 
'192.168.5.4', # formatted as an IP address - 'googbucket', # starts with 'goog' prefix - 'googlebucket', # contains 'google' - 'g00glebucket', # variant of 'google' - 'go0glebucket', # variant of 'google' - 'g0oglebucket', # variant of 'google' - '.abc', # starts with a dot - 'abc.', # ends with a dot - '_abc', # starts with an underscore - 'abc_', # ends with an underscore - ] - - AZURE_INVALID_NAMES = [ - 'ab', # less than 3 characters - # more than 63 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - 'Abcdef', # contains an uppercase letter - '.abc', # starts with a non-letter(dot) - 'a--bc', # contains consecutive hyphens - ] - - IBM_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - 'a.-bc', # contains the sequence '.-' - 'a-.bc', # contains the sequence '-.' 
- 'a&bc' # contains special characters - 'ab^c' # contains special characters - ] - GITIGNORE_SYNC_TEST_DIR_STRUCTURE = { - 'double_asterisk': { - 'double_asterisk_excluded': None, - 'double_asterisk_excluded_dir': { - 'dir_excluded': None, - }, - }, - 'double_asterisk_parent': { - 'parent': { - 'also_excluded.txt': None, - 'child': { - 'double_asterisk_parent_child_excluded.txt': None, - }, - 'double_asterisk_parent_excluded.txt': None, - }, - }, - 'excluded.log': None, - 'excluded_dir': { - 'excluded.txt': None, - 'nested_excluded': { - 'excluded': None, - }, - }, - 'exp-1': { - 'be_excluded': None, - }, - 'exp-2': { - 'be_excluded': None, - }, - 'front_slash_excluded': None, - 'included.log': None, - 'included.txt': None, - 'include_dir': { - 'excluded.log': None, - 'included.log': None, - }, - 'nested_double_asterisk': { - 'one': { - 'also_exclude.txt': None, - }, - 'two': { - 'also_exclude.txt': None, - }, - }, - 'nested_wildcard_dir': { - 'monday': { - 'also_exclude.txt': None, - }, - 'tuesday': { - 'also_exclude.txt': None, - }, - }, - 'no_slash_excluded': None, - 'no_slash_tests': { - 'no_slash_excluded': { - 'also_excluded.txt': None, - }, - }, - 'question_mark': { - 'excluded1.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket': { - 'excluded1.txt': None, - }, - 'square_bracket_alpha': { - 'excludedz.txt': None, - }, - 'square_bracket_excla': { - 'excluded2.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket_single': { - 'excluded0.txt': None, - }, - } - - @staticmethod - def create_dir_structure(base_path, structure): - # creates a given file STRUCTURE in BASE_PATH - for name, substructure in structure.items(): - path = os.path.join(base_path, name) - if substructure is None: - # Create a file - open(path, 'a', encoding='utf-8').close() - else: - # Create a subdirectory - os.mkdir(path) - TestStorageWithCredentials.create_dir_structure( - path, substructure) - - @staticmethod - def cli_delete_cmd(store_type, - bucket_name, - 
storage_account_name: str = None): - if store_type == storage_lib.StoreType.S3: - url = f's3://{bucket_name}' - return f'aws s3 rb {url} --force' - if store_type == storage_lib.StoreType.GCS: - url = f'gs://{bucket_name}' - gsutil_alias, alias_gen = data_utils.get_gsutil_command() - return f'{alias_gen}; {gsutil_alias} rm -r {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage container delete ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} ' - f'--name {bucket_name}') - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {url} --force --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone purge {bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' - - @staticmethod - def cli_ls_cmd(store_type, bucket_name, suffix=''): - if store_type == storage_lib.StoreType.S3: - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'aws s3 ls {url}' - if store_type == storage_lib.StoreType.GCS: - if suffix: - url = f'gs://{bucket_name}/{suffix}' - else: - url = f'gs://{bucket_name}' - return f'gsutil ls {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - config_storage_account = skypilot_config.get_nested( - ('azure', 'storage_account'), None) - storage_account_name = config_storage_account if ( - config_storage_account is not None) else ( - 
storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - list_cmd = ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key}') - return list_cmd - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' - - @staticmethod - def cli_region_cmd(store_type, bucket_name=None, storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - assert bucket_name is not None - return ('aws s3api get-bucket-location ' - f'--bucket {bucket_name} --output text') - elif store_type == storage_lib.StoreType.GCS: - assert bucket_name is not None - return (f'gsutil ls -L -b gs://{bucket_name}/ | ' - 'grep "Location constraint" | ' - 'awk \'{print tolower($NF)}\'') - elif store_type == storage_lib.StoreType.AZURE: - # For Azure Blob Storage, the location of the containers are - # determined by the location of storage accounts. 
- assert storage_account_name is not None - return (f'az storage account show --name {storage_account_name} ' - '--query "primaryLocation" --output tsv') - else: - raise NotImplementedError(f'Region command not implemented for ' - f'{store_type}') - - @staticmethod - def cli_count_name_in_bucket(store_type, - bucket_name, - file_name, - suffix='', - storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - if suffix: - return f'aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - else: - return f'aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - elif store_type == storage_lib.StoreType.GCS: - if suffix: - return f'gsutil ls -r gs://{bucket_name}/{suffix} | grep "{file_name}" | wc -l' - else: - return f'gsutil ls -r gs://{bucket_name} | grep "{file_name}" | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - if storage_account_name is None: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - f'grep {file_name} | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - else: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --query 
"length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - - @staticmethod - def cli_count_file_in_bucket(store_type, bucket_name): - if store_type == storage_lib.StoreType.S3: - return f'aws s3 ls s3://{bucket_name} --recursive | wc -l' - elif store_type == storage_lib.StoreType.GCS: - return f'gsutil ls -r gs://{bucket_name}/** | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - 'grep \\"name\\": | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{bucket_name} --recursive --endpoint {endpoint_url} --profile=r2 | wc -l' - - @pytest.fixture - def tmp_source(self, tmp_path): - # Creates a temporary directory with a file in it - tmp_dir = tmp_path / 'tmp-source' - tmp_dir.mkdir() - tmp_file = tmp_dir / 'tmp-file' - tmp_file.write_text('test') - circle_link = tmp_dir / 'circle-link' - circle_link.symlink_to(tmp_dir, target_is_directory=True) - yield str(tmp_dir) - - @staticmethod - def generate_bucket_name(): - # Creates a temporary bucket name - # time.time() returns varying precision on different systems, so we - # replace the decimal point and use whatever precision we can get. 
- timestamp = str(time.time()).replace('.', '') - return f'sky-test-{timestamp}' - - @pytest.fixture - def tmp_bucket_name(self): - yield self.generate_bucket_name() - - @staticmethod - def yield_storage_object( - name: Optional[str] = None, - source: Optional[storage_lib.Path] = None, - stores: Optional[Dict[storage_lib.StoreType, - storage_lib.AbstractStore]] = None, - persistent: Optional[bool] = True, - mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT): - # Creates a temporary storage object. Stores must be added in the test. - storage_obj = storage_lib.Storage(name=name, - source=source, - stores=stores, - persistent=persistent, - mode=mode) - yield storage_obj - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_scratch_storage_obj(self, tmp_bucket_name): - # Creates a storage object with no source to create a scratch storage. - # Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name) - - @pytest.fixture - def tmp_multiple_scratch_storage_obj(self): - # Creates a list of 5 storage objects with no source to create - # multiple scratch storages. - # Stores for each object in the list must be added in the test. 
- storage_mult_obj = [] - for _ in range(5): - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}') - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_multiple_custom_source_storage_obj(self): - # Creates a list of storage objects with custom source names to - # create multiple scratch storages. - # Stores for each object in the list must be added in the test. - custom_source_names = ['"path With Spaces"', 'path With Spaces'] - storage_mult_obj = [] - for name in custom_source_names: - src_path = os.path.expanduser(f'~/{name}') - pathlib.Path(src_path).expanduser().mkdir(exist_ok=True) - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}', - source=src_path) - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.fixture - def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temporary storage object. Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmp_source) - - @pytest.fixture - def tmp_local_list_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temp storage object which uses a list of paths as source. - # Stores must be added in the test. 
After upload, the bucket should - # have two files - /tmp-file and /tmp-source/tmp-file - list_source = [tmp_source, tmp_source + '/tmp-file'] - yield from self.yield_storage_object(name=tmp_bucket_name, - source=list_source) - - @pytest.fixture - def tmp_bulk_del_storage_obj(self, tmp_bucket_name): - # Creates a temporary storage object for testing bulk deletion. - # Stores must be added in the test. - with tempfile.TemporaryDirectory() as tmpdir: - subprocess.check_output(f'mkdir -p {tmpdir}/folder{{000..255}}', - shell=True) - subprocess.check_output(f'touch {tmpdir}/test{{000..255}}.txt', - shell=True) - subprocess.check_output( - f'touch {tmpdir}/folder{{000..255}}/test.txt', shell=True) - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmpdir) - - @pytest.fixture - def tmp_copy_mnt_existing_storage_obj(self, tmp_scratch_storage_obj): - # Creates a copy mount storage which reuses an existing storage object. - tmp_scratch_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_scratch_storage_obj.name - - # Try to initialize another storage with the storage object created - # above, but now in COPY mode. This should succeed. - yield from self.yield_storage_object(name=storage_name, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_gitignore_storage_obj(self, tmp_bucket_name, gitignore_structure): - # Creates a temporary storage object for testing .gitignore filter. - # GITIGNORE_STRUCTURE represents a file structure in a dictionary - # format. Created storage object will contain the file structure along - # with .gitignore and .git/info/exclude files to test exclude filter. - # Stores must be added in the test. 
- with tempfile.TemporaryDirectory() as tmpdir: - # Creates file structure to be uploaded in the Storage - self.create_dir_structure(tmpdir, gitignore_structure) - - # Create .gitignore and list files/dirs to be excluded in it - skypilot_path = os.path.dirname(os.path.dirname(sky.__file__)) - temp_path = f'{tmpdir}/.gitignore' - file_path = os.path.join(skypilot_path, 'tests/gitignore_test') - shutil.copyfile(file_path, temp_path) - - # Create .git/info/exclude and list files/dirs to be excluded in it - temp_path = f'{tmpdir}/.git/info/' - os.makedirs(temp_path) - temp_exclude_path = os.path.join(temp_path, 'exclude') - file_path = os.path.join(skypilot_path, - 'tests/git_info_exclude_test') - shutil.copyfile(file_path, temp_exclude_path) - - # Create sky Storage with the files created - yield from self.yield_storage_object( - name=tmp_bucket_name, - source=tmpdir, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_awscli_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call(['aws', 's3', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['aws', 's3', 'rb', bucket_uri, '--force']) - - @pytest.fixture - def tmp_gsutil_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - bucket_uri = f'gs://{tmp_bucket_name}' - subprocess.check_call(['gsutil', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['gsutil', 'rm', '-r', bucket_uri]) - - @pytest.fixture - def tmp_az_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - bucket_uri = data_utils.AZURE_CONTAINER_URL.format( - storage_account_name=storage_account_name, - container_name=tmp_bucket_name) - 
subprocess.check_call([ - 'az', 'storage', 'container', 'create', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call([ - 'az', 'storage', 'container', 'delete', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - - @pytest.fixture - def tmp_awscli_bucket_r2(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - endpoint_url = cloudflare.create_endpoint() - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 mb {bucket_uri} --endpoint {endpoint_url} --profile=r2', - shell=True) - yield tmp_bucket_name, bucket_uri - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {bucket_uri} --force --endpoint {endpoint_url} --profile=r2', - shell=True) - - @pytest.fixture - def tmp_ibm_cos_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using IBM COS API - storage_obj = storage_lib.IBMCosStore(source="", name=tmp_bucket_name) - yield tmp_bucket_name - storage_obj.delete() - - @pytest.fixture - def tmp_public_storage_obj(self, request): - # Initializes a storage object with a public bucket - storage_obj = storage_lib.Storage(source=request.param) - yield storage_obj - # This does not require any deletion logic because it is a public bucket - # and should not get added to global_user_state. 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, - store_type): - # Creates a new bucket with a local source, uploads files to it - # and deletes it. - tmp_local_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name in out.decode('utf-8') - - # Run sky storage delete to delete the storage object - subprocess.check_output( - ['sky', 'storage', 'delete', tmp_local_storage_obj.name, '--yes']) - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.xdist_group('multiple_bucket_deletion') - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm) - ]) - def test_multiple_buckets_creation_and_deletion( - self, tmp_multiple_scratch_storage_obj, store_type): - # Creates multiple new buckets(5 buckets) with a local source - # and deletes them. 
- storage_obj_name = [] - for store_obj in tmp_multiple_scratch_storage_obj: - store_obj.add_store(store_type) - storage_obj_name.append(store_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in out for item in storage_obj_name]) - - # Run sky storage delete all to delete all storage objects - delete_cmd = ['sky', 'storage', 'delete', '--yes'] - delete_cmd += storage_obj_name - subprocess.check_output(delete_cmd) - - # Run sky storage ls to check if all storage objects filtered by store - # type are deleted - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item not in out for item in storage_obj_name]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_upload_source_with_spaces(self, store_type, - tmp_multiple_custom_source_storage_obj): - # Creates two buckets with specified local sources - # with spaces in the name - storage_obj_names = [] - for storage_obj in tmp_multiple_custom_source_storage_obj: - storage_obj.add_store(store_type) - storage_obj_names.append(storage_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in 
out for item in storage_obj_names]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_external_deletion(self, tmp_scratch_storage_obj, - store_type): - # Creates a bucket, deletes it externally using cloud cli commands - # and then tries to delete it using sky storage delete. - tmp_scratch_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name in out.decode('utf-8') - - # Delete bucket externally - cmd = self.cli_delete_cmd(store_type, tmp_scratch_storage_obj.name) - subprocess.check_output(cmd, shell=True) - - # Run sky storage delete to delete the storage object - out = subprocess.check_output( - ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name, '--yes']) - # Make sure bucket was not created during deletion (see issue #1322) - assert 'created' not in out.decode('utf-8').lower() - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_bulk_deletion(self, store_type, tmp_bulk_del_storage_obj): - # Creates a temp folder with over 256 files and folders, upload - # files and folders to a new bucket, then delete bucket. 
- tmp_bulk_del_storage_obj.add_store(store_type) - - subprocess.check_output([ - 'sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name, '--yes' - ]) - - output = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_bulk_del_storage_obj.name not in output.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'tmp_public_storage_obj, store_type', - [('s3://tcga-2-open', storage_lib.StoreType.S3), - ('s3://digitalcorpora', storage_lib.StoreType.S3), - ('gs://gcp-public-data-sentinel-2', storage_lib.StoreType.GCS), - pytest.param( - 'https://azureopendatastorage.blob.core.windows.net/nyctlc', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure)], - indirect=['tmp_public_storage_obj']) - def test_public_bucket(self, tmp_public_storage_obj, store_type): - # Creates a new bucket with a public source and verifies that it is not - # added to global_user_state. - tmp_public_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_public_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'nonexist_bucket_url', - [ - 's3://{random_name}', - 'gs://{random_name}', - pytest.param( - 'https://{account_name}.blob.core.windows.net/{random_name}', # pylint: disable=line-too-long - marks=pytest.mark.azure), - pytest.param('cos://us-east/{random_name}', marks=pytest.mark.ibm), - pytest.param('r2://{random_name}', marks=pytest.mark.cloudflare) - ]) - def test_nonexistent_bucket(self, nonexist_bucket_url): - # Attempts to create fetch a stroage with a non-existent source. 
- # Generate a random bucket name and verify it doesn't exist: - retry_count = 0 - while True: - nonexist_bucket_name = str(uuid.uuid4()) - if nonexist_bucket_url.startswith('s3'): - command = f'aws s3api head-bucket --bucket {nonexist_bucket_name}' - expected_output = '404' - elif nonexist_bucket_url.startswith('gs'): - command = f'gsutil ls {nonexist_bucket_url.format(random_name=nonexist_bucket_name)}' - expected_output = 'BucketNotFoundException' - elif nonexist_bucket_url.startswith('https'): - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - command = f'az storage container exists --account-name {storage_account_name} --account-key {storage_account_key} --name {nonexist_bucket_name}' - expected_output = '"exists": false' - elif nonexist_bucket_url.startswith('r2'): - endpoint_url = cloudflare.create_endpoint() - command = f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api head-bucket --bucket {nonexist_bucket_name} --endpoint {endpoint_url} --profile=r2' - expected_output = '404' - elif nonexist_bucket_url.startswith('cos'): - # Using API calls, since using rclone requires a profile's name - try: - expected_output = command = "echo" # avoid unrelated exception in case of failure. 
- bucket_name = urllib.parse.urlsplit( - nonexist_bucket_url.format( - random_name=nonexist_bucket_name)).path.strip('/') - client = ibm.get_cos_client('us-east') - client.head_bucket(Bucket=bucket_name) - except ibm.ibm_botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == '404': - # success - return - else: - raise ValueError('Unsupported bucket type ' - f'{nonexist_bucket_url}') - - # Check if bucket exists using the cli: - try: - out = subprocess.check_output(command, - stderr=subprocess.STDOUT, - shell=True) - except subprocess.CalledProcessError as e: - out = e.output - out = out.decode('utf-8') - if expected_output in out: - break - else: - retry_count += 1 - if retry_count > 3: - raise RuntimeError('Unable to find a nonexistent bucket ' - 'to use. This is higly unlikely - ' - 'check if the tests are correct.') - - with pytest.raises(sky.exceptions.StorageBucketGetError, - match='Attempted to use a non-existent'): - if nonexist_bucket_url.startswith('https'): - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - account_name=storage_account_name, - random_name=nonexist_bucket_name)) - else: - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - random_name=nonexist_bucket_name)) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'private_bucket', - [ - f's3://imagenet', - f'gs://imagenet', - pytest.param('https://smoketestprivate.blob.core.windows.net/test', - marks=pytest.mark.azure), # pylint: disable=line-too-long - pytest.param('cos://us-east/bucket1', marks=pytest.mark.ibm) - ]) - def test_private_bucket(self, private_bucket): - # Attempts to access private buckets not belonging to the user. - # These buckets are known to be private, but may need to be updated if - # they are removed by their owners. 
- store_type = urllib.parse.urlsplit(private_bucket).scheme - if store_type == 'https' or store_type == 'cos': - private_bucket_name = urllib.parse.urlsplit( - private_bucket).path.strip('/') - else: - private_bucket_name = urllib.parse.urlsplit(private_bucket).netloc - with pytest.raises( - sky.exceptions.StorageBucketGetError, - match=storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( - name=private_bucket_name)): - storage_obj = storage_lib.Storage(source=private_bucket) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_az_bucket', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param('tmp_ibm_cos_bucket', - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_upload_to_existing_bucket(self, ext_bucket_fixture, request, - tmp_source, store_type): - # Tries uploading existing files to newly created bucket (outside of - # sky) and verifies that files are written. 
- bucket_name, _ = request.getfixturevalue(ext_bucket_fixture) - storage_obj = storage_lib.Storage(name=bucket_name, source=tmp_source) - storage_obj.add_store(store_type) - - # Check if tmp_source/tmp-file exists in the bucket using aws cli - out = subprocess.check_output(self.cli_ls_cmd(store_type, bucket_name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check symlinks - symlinks don't get copied by sky storage - assert (pathlib.Path(tmp_source) / 'circle-link').is_symlink(), ( - 'circle-link was not found in the upload source - ' - 'are the test fixtures correct?') - assert 'circle-link' not in out.decode('utf-8'), ( - 'Symlink found in bucket - ls output was : {}'.format( - out.decode('utf-8'))) - - # Run sky storage ls to check if storage object exists in the output. - # It should not exist because the bucket was created externally. - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - def test_copy_mount_existing_storage(self, - tmp_copy_mnt_existing_storage_obj): - # Creates a bucket with no source in MOUNT mode (empty bucket), and - # then tries to load the same storage in COPY mode. - tmp_copy_mnt_existing_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_copy_mnt_existing_storage_obj.name - - # Check `sky storage ls` to ensure storage object exists - out = subprocess.check_output(['sky', 'storage', 'ls']).decode('utf-8') - assert storage_name in out, f'Storage {storage_name} not found in sky storage ls.' 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_list_source(self, tmp_local_list_storage_obj, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. - tmp_local_list_storage_obj.add_store(store_type) - - # Check if tmp-file exists in the bucket root using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check if tmp-file exists in the bucket/tmp-source using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name, 'tmp-source/'), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('invalid_name_list, store_type', - [(AWS_INVALID_NAMES, storage_lib.StoreType.S3), - (GCS_INVALID_NAMES, storage_lib.StoreType.GCS), - pytest.param(AZURE_INVALID_NAMES, - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param(IBM_INVALID_NAMES, - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param(AWS_INVALID_NAMES, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_invalid_names(self, invalid_name_list, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. 
- for name in invalid_name_list: - with pytest.raises(sky.exceptions.StorageNameError): - storage_obj = storage_lib.Storage(name=name) - storage_obj.add_store(store_type) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'gitignore_structure, store_type', - [(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.S3), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.GCS), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.AZURE), - pytest.param(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, - store_type, - tmp_gitignore_storage_obj): - # tests if files included in .gitignore and .git/info/exclude are - # excluded from being transferred to Storage - - tmp_gitignore_storage_obj.add_store(store_type) - - upload_file_name = 'included' - # Count the number of files with the given file name - up_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name=upload_file_name) - git_exclude_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name='.git') - cnt_num_file_cmd = self.cli_count_file_in_bucket( - store_type, tmp_gitignore_storage_obj.name) - - up_output = subprocess.check_output(up_cmd, shell=True) - git_exclude_output = subprocess.check_output(git_exclude_cmd, - shell=True) - cnt_output = subprocess.check_output(cnt_num_file_cmd, shell=True) - - assert '3' in up_output.decode('utf-8'), \ - 'Files to be included are not completely uploaded.' - # 1 is read as .gitignore is uploaded - assert '1' in git_exclude_output.decode('utf-8'), \ - '.git directory should not be uploaded.' - # 4 files include .gitignore, included.log, included.txt, include_dir/included.log - assert '4' in cnt_output.decode('utf-8'), \ - 'Some items listed in .gitignore and .git/info/exclude are not excluded.' 
- - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_externally_created_bucket_mount_without_source( - self, ext_bucket_fixture, request, store_type): - # Non-sky managed buckets(buckets created outside of Skypilot CLI) - # are allowed to be MOUNTed by specifying the URI of the bucket to - # source field only. When it is attempted by specifying the name of - # the bucket only, it should error out. - # - # TODO(doyoung): Add test for IBM COS. Currently, this is blocked - # as rclone used to interact with IBM COS does not support feature to - # create a bucket, and the ibmcloud CLI is not supported in Skypilot. - # Either of the feature is necessary to simulate an external bucket - # creation for IBM COS. - # https://github.com/skypilot-org/skypilot/pull/1966/files#r1253439837 - - ext_bucket_name, ext_bucket_uri = request.getfixturevalue( - ext_bucket_fixture) - # invalid spec - with pytest.raises(sky.exceptions.StorageSpecError) as e: - storage_obj = storage_lib.Storage( - name=ext_bucket_name, mode=storage_lib.StorageMode.MOUNT) - storage_obj.add_store(store_type) - - assert 'Attempted to mount a non-sky managed bucket' in str(e) - - # valid spec - storage_obj = storage_lib.Storage(source=ext_bucket_uri, - mode=storage_lib.StorageMode.MOUNT) - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'ap-northeast-1', 'ap-northeast-2', 'ap-northeast-3', 'ap-south-1', - 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'eu-north-1', - 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 'us-east-1', - 'us-east-2', 'us-west-1', 'us-west-2' - ]) - def test_aws_regions(self, tmp_local_storage_obj, region): - # This tests 
creation and upload to bucket in all AWS s3 regions - # To test full functionality, use test_managed_jobs_storage above. - store_type = storage_lib.StoreType.S3 - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - expected_output_region = region - if region == 'us-east-1': - expected_output_region = 'None' # us-east-1 is the default region - assert expected_output_region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'northamerica-northeast1', 'northamerica-northeast2', 'us-central1', - 'us-east1', 'us-east4', 'us-east5', 'us-south1', 'us-west1', 'us-west2', - 'us-west3', 'us-west4', 'southamerica-east1', 'southamerica-west1', - 'europe-central2', 'europe-north1', 'europe-southwest1', 'europe-west1', - 'europe-west2', 'europe-west3', 'europe-west4', 'europe-west6', - 'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12', - 'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2', - 'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2', - 'asia-southeast2', 'me-central1', 'me-central2', 'me-west1', - 'australia-southeast1', 'australia-southeast2', 'africa-south1' - ]) - def test_gcs_regions(self, tmp_local_storage_obj, region): - # This tests creation and upload to bucket in all GCS regions - # To test full functionality, use 
test_managed_jobs_storage above. - store_type = storage_lib.StoreType.GCS - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - assert region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - -# ---------- Testing YAML Specs ---------- -# Our sky storage requires credentials to check the bucket existance when -# loading a task from the yaml file, so we cannot make it a unit test. -class TestYamlSpecs: - # TODO(zhwu): Add test for `to_yaml_config` for the Storage object. - # We should not use `examples/storage_demo.yaml` here, since it requires - # users to ensure bucket names to not exist and/or be unique. 
- _TEST_YAML_PATHS = [ - 'examples/minimal.yaml', 'examples/managed_job.yaml', - 'examples/using_file_mounts.yaml', 'examples/resnet_app.yaml', - 'examples/multi_hostname.yaml' - ] - - def _is_dict_subset(self, d1, d2): - """Check if d1 is the subset of d2.""" - for k, v in d1.items(): - if k not in d2: - if isinstance(v, list) or isinstance(v, dict): - assert len(v) == 0, (k, v) - else: - assert False, (k, v) - elif isinstance(v, dict): - assert isinstance(d2[k], dict), (k, v, d2) - self._is_dict_subset(v, d2[k]) - elif isinstance(v, str): - if k == 'accelerators': - resources = sky.Resources() - resources._set_accelerators(v, None) - assert resources.accelerators == d2[k], (k, v, d2) - else: - assert v.lower() == d2[k].lower(), (k, v, d2[k]) - else: - assert v == d2[k], (k, v, d2[k]) - - def _check_equivalent(self, yaml_path): - """Check if the yaml is equivalent after load and dump again.""" - origin_task_config = common_utils.read_yaml(yaml_path) - - task = sky.Task.from_yaml(yaml_path) - new_task_config = task.to_yaml_config() - # d1 <= d2 - print(origin_task_config, new_task_config) - self._is_dict_subset(origin_task_config, new_task_config) - - def test_load_dump_yaml_config_equivalent(self): - """Test if the yaml config is equivalent after load and dump again.""" - pathlib.Path('~/datasets').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/tmpfile').expanduser().touch() - pathlib.Path('~/.ssh').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/.ssh/id_rsa.pub').expanduser().touch() - pathlib.Path('~/tmp-workdir').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/Downloads/tpu').expanduser().mkdir(parents=True, - exist_ok=True) - for yaml_path in self._TEST_YAML_PATHS: - self._check_equivalent(yaml_path) - - -# ---------- Testing Multiple Accelerators ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support K80 gpus for now -@pytest.mark.no_paperspace # Paperspace does not support K80 gpus -def test_multiple_accelerators_ordered(): - name 
= _get_cluster_name() - test = Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_ordered_with_default(): - name = _get_cluster_name() - test = Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered(): - name = _get_cluster_name() - test = Test( - 'multiple-accelerators-unordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered_with_default(): - name = _get_cluster_name() - test = Test( - 'multiple-accelerators-unordered-with-default', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -def test_multiple_resources(): - name = _get_cluster_name() - test = Test( - 'multiple-resources', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Sky Benchmark ---------- -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -@pytest.mark.no_paperspace # Requires other clouds to be enabled -@pytest.mark.no_kubernetes -@pytest.mark.aws # SkyBenchmark requires S3 access -def test_sky_bench(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'sky-bench', - [ - f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', - 'sleep 120', - f'sky bench show {name} | grep sky-bench-{name} | grep FINISHED', - ], - f'sky bench down {name} -y; sky bench delete {name} -y', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_failover(): - """Test if the kubernetes context failover works. - - This test requires two kubernetes clusters: - - kind-skypilot: the local cluster with mock labels for 8 H100 GPUs. 
- - another accessible cluster: with enough CPUs - To start the first cluster, run: - sky local up - # Add mock label for accelerator - kubectl label node --overwrite skypilot-control-plane skypilot.co/accelerator=h100 --context kind-skypilot - # Get the token for the cluster in context kind-skypilot - TOKEN=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.users[0].user.token}\') - # Get the API URL for the cluster in context kind-skypilot - API_URL=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.clusters[0].cluster.server}\') - # Add mock capacity for GPU - curl --header "Content-Type: application/json-patch+json" --header "Authorization: Bearer $TOKEN" --request PATCH --data \'[{"op": "add", "path": "/status/capacity/nvidia.com~1gpu", "value": "8"}]\' "$API_URL/api/v1/nodes/skypilot-control-plane/status" - # Add a new namespace to test the handling of namespaces - kubectl create namespace test-namespace --context kind-skypilot - # Set the namespace to test-namespace - kubectl config set-context kind-skypilot --namespace=test-namespace --context kind-skypilot - """ - # Get context that is not kind-skypilot - contexts = subprocess.check_output('kubectl config get-contexts -o name', - shell=True).decode('utf-8').split('\n') - context = [context for context in contexts if context != 'kind-skypilot'][0] - config = textwrap.dedent(f"""\ - kubernetes: - allowed_contexts: - - kind-skypilot - - {context} - """) - with tempfile.NamedTemporaryFile(delete=True) as f: - f.write(config.encode('utf-8')) - f.flush() - name = _get_cluster_name() - test = Test( - 'kubernetes-context-failover', - [ - # Check if kind-skypilot is provisioned with H100 annotations already - 'NODE_INFO=$(kubectl get nodes -o yaml --context kind-skypilot) && ' - 'echo "$NODE_INFO" | grep nvidia.com/gpu | grep 8 && ' - 'echo "$NODE_INFO" | grep skypilot.co/accelerator | grep h100 || ' - '{ echo "kind-skypilot does not exist ' - 'or does not have mock labels 
for GPUs. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - # Check namespace for kind-skypilot is test-namespace - 'kubectl get namespaces --context kind-skypilot | grep test-namespace || ' - '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - 'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"', - # Get contexts and set current context to the other cluster that is not kind-skypilot - f'kubectl config use-context {context}', - # H100 should not in the current context - '! sky show-gpus --cloud kubernetes | grep H100', - f'sky launch -y -c {name}-1 --cpus 1 echo hi', - f'sky logs {name}-1 --status', - # It should be launched not on kind-skypilot - f'sky status -a {name}-1 | grep "{context}"', - # Test failure for launching H100 on other cluster - f'sky launch -y -c {name}-2 --gpus H100 --cpus 1 --cloud kubernetes --region {context} echo hi && exit 1 || true', - # Test failover - f'sky launch -y -c {name}-3 --gpus H100 --cpus 1 --cloud kubernetes echo hi', - f'sky logs {name}-3 --status', - # Test pods - f'kubectl get pods --context kind-skypilot | grep "{name}-3"', - # It should be launched on kind-skypilot - f'sky status -a {name}-3 | grep "kind-skypilot"', - # Should be 7 free GPUs - f'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep " 7"', - # Remove the line with "kind-skypilot" - f'sed -i "/kind-skypilot/d" {f.name}', - # Should still be able to exec and launch on existing cluster - f'sky exec {name}-3 "echo hi"', - f'sky logs {name}-3 --status', - f'sky status -r {name}-3 | grep UP', - f'sky launch -c {name}-3 --gpus h100 echo hi', - f'sky logs {name}-3 --status', - f'sky status -r {name}-3 | grep UP', - ], - f'sky down -y {name}-1 {name}-3', - env={'SKYPILOT_CONFIG': f.name}, - ) - run_one_test(test) diff --git 
a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 322c19a266e..37b61caa328 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -1,44 +1,22 @@ -import enum import inspect -import json import os -import pathlib -import shlex -import shutil import subprocess import sys import tempfile -import textwrap -import time from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse import uuid import colorama -import jinja2 import pytest import sky -from sky import global_user_state -from sky import jobs from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm from sky.clouds import AWS -from sky.clouds import Azure from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus from sky.utils import common_utils -from sky.utils import resources_utils from sky.utils import subprocess_utils # To avoid the second smoke test reusing the cluster launched in the first @@ -64,9 +42,9 @@ # Get the job queue, and print it once on its own, then print it again to # use with grep by the caller. -_GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' +GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' # Wait for a job to be not in RUNNING state. Used to check for RECOVERING. -_JOB_WAIT_NOT_RUNNING = ( +JOB_WAIT_NOT_RUNNING = ( 's=$(sky jobs queue);' 'until ! 
echo "$s" | grep "{job_name}" | grep "RUNNING"; do ' 'sleep 10; s=$(sky jobs queue);' @@ -78,7 +56,7 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( +WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 'start_time=$SECONDS; ' @@ -97,9 +75,9 @@ 'done') -def _get_cmd_wait_until_cluster_status_contains_wildcard( +def get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard: str, cluster_status: str, timeout: int): - wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + wait_cmd = WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', 'awk "/^{cluster_name_awk}/') @@ -110,7 +88,7 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( timeout=timeout) -_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( +WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' 'while true; do ' @@ -124,7 +102,7 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( 'sleep 10; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( +WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. 
'start_time=$SECONDS; ' @@ -149,15 +127,15 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( 'sleep 10; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') # Managed job functions -_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( +WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( 'sky queue {cluster_name}', 'sky jobs queue').replace( 'awk "\\$2 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( @@ -166,7 +144,7 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by # _BUMP_UP_SECONDS before exiting. -_BUMP_UP_SECONDS = 35 +BUMP_UP_SECONDS = 35 DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -191,13 +169,13 @@ def echo(self, message: str): print(message, file=sys.stderr, flush=True) -def _get_timeout(generic_cloud: str, - override_timeout: int = DEFAULT_CMD_TIMEOUT): +def get_timeout(generic_cloud: str, + override_timeout: int = DEFAULT_CMD_TIMEOUT): timeouts = {'fluidstack': 60 * 60} # file_mounts return timeouts.get(generic_cloud, override_timeout) -def _get_cluster_name() -> str: +def get_cluster_name() -> str: """Returns a user-unique cluster name for each test_(). Must be called from each test_(). 
@@ -210,7 +188,7 @@ def _get_cluster_name() -> str: return f'{test_name}-{test_id}' -def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: +def terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: cluster_name = serve.generate_replica_cluster_name(name, replica_id) query_cmd = (f'gcloud compute instances list --filter=' f'"(labels.ray-cluster-name:{cluster_name})" ' @@ -352,7 +330,7 @@ def get_gcp_region_for_quota_failover() -> Optional[str]: return None -_VALIDATE_LAUNCH_OUTPUT = ( +VALIDATE_LAUNCH_OUTPUT = ( # Validate the output of the job submission: # ⚙️ Launching on Kubernetes. # Pod is up. diff --git a/tests/test_smoke.py b/tests/test_smoke.py new file mode 100644 index 00000000000..d1dc2129422 --- /dev/null +++ b/tests/test_smoke.py @@ -0,0 +1,36 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +# All files categorized under tests/smoke_tests/* +# Please add new test cases under that directory. 
+from smoke_tests.test_basic import * +from smoke_tests.test_cluster_job import * +from smoke_tests.test_images import * +from smoke_tests.test_managed_job import * +from smoke_tests.test_mount_and_storage import * +from smoke_tests.test_region_and_zone import * +from smoke_tests.test_sky_serve import * diff --git a/tests/test_yamls/minimal_test_required_before_merge.yaml b/tests/test_yamls/minimal_test_required_before_merge.yaml new file mode 100644 index 00000000000..aceb5a76cb0 --- /dev/null +++ b/tests/test_yamls/minimal_test_required_before_merge.yaml @@ -0,0 +1,13 @@ +resources: + cloud: aws + instance_type: t3.small + +file_mounts: + ~/aws: . + +workdir: . + +num_nodes: 1 + +run: | + ls -l ~/aws/tests/test_yamls/minimal_test_required_before_merge.yaml From e11a7d123328db6dc486a6323cde2968b00f4380 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 15:40:34 +0800 Subject: [PATCH 26/64] remove unsupport cloud for now --- .buildkite/generate_pipeline.py | 16 +++- .buildkite/pipeline_smoke_test_basic.yaml | 9 -- .../pipeline_smoke_test_cluster_job.yaml | 87 ------------------- .../pipeline_smoke_test_managed_job.yaml | 33 ------- ...pipeline_smoke_test_mount_and_storage.yaml | 25 ------ .../pipeline_smoke_test_region_and_zone.yaml | 8 -- .buildkite/pipeline_smoke_test_sky_serve.yaml | 73 ---------------- 7 files changed, 15 insertions(+), 236 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index b363c695057..cb135b41a61 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -7,6 +7,10 @@ import yaml DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] +# We only have credentials for aws, azure, and gcp. +# For those test cases that run on other clouds, +# we currently ignore them. 
+ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] def _get_full_decorator_path(decorator: ast.AST) -> str: @@ -59,6 +63,16 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: cloud for cloud in clouds_to_include if cloud not in clouds_to_exclude ] + final_clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud in ALL_CLOUDS_WITH_CREDENTIALS + ] + if clouds_to_include and not final_clouds_to_include: + print(f'Warning: {file_path}:{node.name} ' + f'is marked to run on {clouds_to_include}, ' + f'but we do not have credentials for those clouds. ' + f'Skipped.') + continue function_name = (f'{class_name}::{node.name}' if class_name else node.name) function_cloud_map[function_name] = (clouds_to_include) @@ -100,7 +114,7 @@ def main(): '.buildkite/generate_pipeline.py, Please do not ' 'edit directly.\n') yaml.dump(pipeline, file, default_flow_style=False) - print(f'Convert {test_file_path} to {yaml_file_path}') + print(f'Convert {test_file_path} to {yaml_file_path}\n\n') if __name__ == '__main__': diff --git a/.buildkite/pipeline_smoke_test_basic.yaml b/.buildkite/pipeline_smoke_test_basic.yaml index 9c775c1f5fb..d0ba641c48c 100644 --- a/.buildkite/pipeline_smoke_test_basic.yaml +++ b/.buildkite/pipeline_smoke_test_basic.yaml @@ -39,10 +39,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_cli_logs on aws -- command: pytest tests/smoke_tests/test_basic.py::test_scp_logs --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_logs on scp - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp env: LOG_TO_STDOUT: '1' @@ -79,11 +75,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_sky_bench on aws -- command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_context_failover on kubernetes - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent --aws env: diff --git 
a/.buildkite/pipeline_smoke_test_cluster_job.yaml b/.buildkite/pipeline_smoke_test_cluster_job.yaml index 3b81274a00a..8a813119eb2 100644 --- a/.buildkite/pipeline_smoke_test_cluster_job.yaml +++ b/.buildkite/pipeline_smoke_test_cluster_job.yaml @@ -9,18 +9,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_job_queue_with_docker on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --lambda_cloud - env: - LOG_TO_STDOUT: '1' - label: test_lambda_job_queue on lambda_cloud -- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_job_queue on ibm -- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_job_queue --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_job_queue on scp - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws env: @@ -35,11 +23,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_fast_large_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue_multinode - --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_job_queue_multinode on ibm - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws env: @@ -53,14 +36,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_huggingface on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --lambda_cloud - env: - LOG_TO_STDOUT: '1' - label: test_lambda_huggingface on lambda_cloud -- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_huggingface --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_huggingface on scp - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws env: LOG_TO_STDOUT: '1' @@ -77,10 +52,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_tpu_vm_pod on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_tpu_pod_slice_gke on 
kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws env: LOG_TO_STDOUT: '1' @@ -104,21 +75,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_azure_http_server_with_custom_ports on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_http_server_with_custom_ports on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_paperspace_http_server_with_custom_ports - --paperspace - env: - LOG_TO_STDOUT: '1' - label: test_paperspace_http_server_with_custom_ports on paperspace -- command: pytest tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports - --runpod - env: - LOG_TO_STDOUT: '1' - label: test_runpod_http_server_with_custom_ports on runpod - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws env: LOG_TO_STDOUT: '1' @@ -127,36 +83,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_task_labels_gcp on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_task_labels_kubernetes on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_container_logs_multinode_kubernetes on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes 
- env: - LOG_TO_STDOUT: '1' - label: test_container_logs_two_jobs_kubernetes on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws env: LOG_TO_STDOUT: '1' @@ -177,10 +103,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_autodown on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_autodown --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_autodown on scp - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws env: LOG_TO_STDOUT: '1' @@ -197,10 +119,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_cancel_pytorch on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_ibm --ibm - env: - LOG_TO_STDOUT: '1' - label: test_cancel_ibm on ibm - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws env: LOG_TO_STDOUT: '1' @@ -221,11 +139,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_aws_custom_image on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_custom_image on kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes --azure env: diff --git a/.buildkite/pipeline_smoke_test_managed_job.yaml b/.buildkite/pipeline_smoke_test_managed_job.yaml index cda2b87a53c..fee2ae1f3c8 100644 --- a/.buildkite/pipeline_smoke_test_managed_job.yaml +++ b/.buildkite/pipeline_smoke_test_managed_job.yaml @@ -1,23 +1,5 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
steps: -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs on managed_jobs -- command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_job_pipeline on managed_jobs -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_failed_setup on managed_jobs -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_pipeline_failed_setup on managed_jobs - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws --aws env: @@ -38,11 +20,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_managed_jobs_pipeline_recovery_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_default_resources on managed_jobs - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws env: @@ -63,17 +40,7 @@ steps: env: LOG_TO_STDOUT: '1' label: test_managed_jobs_cancellation_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_storage on managed_jobs - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp env: LOG_TO_STDOUT: '1' label: test_managed_jobs_tpu on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_inline_env on managed_jobs diff --git a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml index 
6f1d11e7804..01f8739dd79 100644 --- a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml +++ b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml @@ -4,11 +4,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_file_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_scp_file_mounts - --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_file_mounts on scp - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars --aws env: @@ -29,31 +24,11 @@ steps: env: LOG_TO_STDOUT: '1' label: test_azure_storage_mounts_with_stop on azure -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_storage_mounts on kubernetes -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_context_switch on kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts --aws env: LOG_TO_STDOUT: '1' label: test_docker_storage_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_cloudflare_storage_mounts - --cloudflare - env: - LOG_TO_STDOUT: '1' - label: test_cloudflare_storage_mounts on cloudflare -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_ibm_storage_mounts - --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_storage_mounts on ibm - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion --aws env: diff --git a/.buildkite/pipeline_smoke_test_region_and_zone.yaml b/.buildkite/pipeline_smoke_test_region_and_zone.yaml index ae38eb4b594..aa955bc1864 100644 --- a/.buildkite/pipeline_smoke_test_region_and_zone.yaml +++ b/.buildkite/pipeline_smoke_test_region_and_zone.yaml @@ -14,10 +14,6 @@ steps: env: LOG_TO_STDOUT: '1' label: 
test_gcp_region_and_service_account on gcp -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_region --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_region on ibm - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure env: LOG_TO_STDOUT: '1' @@ -26,10 +22,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_aws_zone on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_zone --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_zone on ibm - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp env: LOG_TO_STDOUT: '1' diff --git a/.buildkite/pipeline_smoke_test_sky_serve.yaml b/.buildkite/pipeline_smoke_test_sky_serve.yaml index 0fd84641780..4cd4d35aa4d 100644 --- a/.buildkite/pipeline_smoke_test_sky_serve.yaml +++ b/.buildkite/pipeline_smoke_test_sky_serve.yaml @@ -12,94 +12,21 @@ steps: env: LOG_TO_STDOUT: '1' label: test_skyserve_azure_http on azure -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_kubernetes_http on kubernetes -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_oci_http --oci - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_oci_http on oci -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_llm on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery --gcp env: LOG_TO_STDOUT: '1' label: test_skyserve_spot_recovery on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_base_ondemand_fallback on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback --gcp env: LOG_TO_STDOUT: '1' label: test_skyserve_dynamic_ondemand_fallback on gcp -- command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_user_bug_restart on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_load_balancer on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart --gcp env: LOG_TO_STDOUT: '1' label: test_skyserve_auto_restart on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_cancel on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_streaming on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_readiness_timeout_fail on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_large_readiness_timeout on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_update on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_rolling_update on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_fast_update on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_update_autoscale on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_new_autoscaler_update on 
serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_failures on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws env: LOG_TO_STDOUT: '1' From 8a651508b960343ec147d0be548977cd589b2edf Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 12:05:32 +0800 Subject: [PATCH 27/64] merge branch 'reliable_smoke_test_more' --- tests/smoke_tests/test_basic.py | 29 ++-- tests/smoke_tests/test_cluster_job.py | 48 +++--- tests/smoke_tests/test_images.py | 14 +- tests/smoke_tests/test_managed_job.py | 153 +++++++++--------- tests/smoke_tests/test_region_and_zone.py | 12 +- .../smoke_tests/test_required_before_merge.py | 7 +- tests/smoke_tests/util.py | 83 ++++++++-- 7 files changed, 209 insertions(+), 137 deletions(-) diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index 0090ae957b8..1f76254b67d 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -27,13 +27,14 @@ import pytest from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains +from smoke_tests.util import ( + get_cmd_wait_until_job_status_contains_without_matching_job) from smoke_tests.util import get_timeout from smoke_tests.util import run_one_test from smoke_tests.util import SCP_TYPE from smoke_tests.util import Test from smoke_tests.util import VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB import sky from sky.skylet import events @@ -142,9 +143,9 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'sky status -r {name} | grep UP', # Ensure cluster is stopped - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + 
cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Launch again. Do full output validation - we expect the cluster to re-launch @@ -170,9 +171,9 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', @@ -201,17 +202,17 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, + job_status=[JobStatus.FAILED_DRIVER], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', @@ -242,9 +243,9 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, + job_status=[JobStatus.FAILED_DRIVER], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', @@ -354,9 +355,9 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED, + cluster_status=[ClusterStatus.STOPPED], timeout=120) # Run it again - should work with fast=True sky.launch(task, diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 22b6d9dc8f0..5fce0c2208c 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -28,6 +28,9 @@ from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_aws_region_for_quota_failover from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains +from smoke_tests.util import ( + get_cmd_wait_until_job_status_contains_matching_job_id) from smoke_tests.util import get_gcp_region_for_quota_failover from smoke_tests.util import get_timeout from smoke_tests.util import LAMBDA_TYPE @@ -35,8 +38,6 @@ from smoke_tests.util import SCP_GPU_V100 from smoke_tests.util import SCP_TYPE from smoke_tests.util import Test -from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID import sky from sky import AWS @@ -419,10 +420,10 @@ def test_multi_echo(generic_cloud: str): ] + # Ensure jobs succeeded. 
[ - WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=i + 1, - job_status=JobStatus.SUCCEEDED.value, + job_status=[JobStatus.SUCCEEDED], timeout=120) for i in range(32) ] + # Ensure monitor/autoscaler didn't crash on the 'assert not @@ -996,17 +997,16 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=200), ], f'sky down -y {name}', @@ -1030,10 +1030,9 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=280) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], @@ -1071,9 +1070,9 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. 
- WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). @@ -1090,9 +1089,9 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Test restarting the idleness timer via exec: @@ -1102,9 +1101,9 @@ def test_autostop(generic_cloud: str): 'sleep 45', # Almost reached the threshold. f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout + BUMP_UP_SECONDS), ], f'sky down -y {name}', @@ -1322,18 +1321,18 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=120), ], f'sky down -y {name}', @@ 
-1439,10 +1438,9 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + cluster_status=[ClusterStatus.INIT, ClusterStatus.STOPPED], timeout=200 + BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index e2e4c440b89..b66211d016d 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -20,9 +20,9 @@ # > pytest tests/smoke_tests/test_images.py --generic-cloud aws import pytest -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_is_not_found +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains from smoke_tests.util import run_one_test from smoke_tests.util import Test @@ -279,9 +279,9 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=60), # Wait for EC2 instance to be in stopped state. # TODO: event based wait. @@ -331,7 +331,7 @@ def test_gcp_mig(): # Check MIG exists. 
f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + get_cmd_wait_until_cluster_is_not_found(cluster_name=name, timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template @@ -399,9 +399,9 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 521b08797f5..5f3e3b2117c 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -29,14 +29,14 @@ import pytest from smoke_tests.util import _BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name +from smoke_tests.util import ( + get_cmd_wait_until_managed_job_status_contains_matching_job_name) from smoke_tests.util import GET_JOB_QUEUE from smoke_tests.util import JOB_WAIT_NOT_RUNNING from smoke_tests.util import run_one_test from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test from smoke_tests.util import TestStorageWithCredentials -from smoke_tests.util import ( - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) from sky import jobs from sky.data import storage as storage_lib @@ -58,20 +58,24 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} 
examples/managed_job.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.INIT, + ManagedJobStatus.RUNNING + ], timeout=60), - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.INIT, + ManagedJobStatus.RUNNING + ], timeout=60), f'sky jobs cancel -y -n {name}-1', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status=f'{ManagedJobStatus.CANCELLED.value}', + job_status=[ManagedJobStatus.CANCELLED], timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', @@ -169,9 +173,9 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + job_status=[ManagedJobStatus.FAILED_SETUP], timeout=600), # Make sure the job failed quickly. 
f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', @@ -206,9 +210,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. @@ -219,9 +223,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): '--output text)'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], @@ -250,18 +254,18 @@ def test_managed_jobs_recovery_gcp(): 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], @@ -285,9 +289,9 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -307,9 +311,9 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): '--output text)'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -340,9 +344,9 @@ def 
test_managed_jobs_pipeline_recovery_gcp(): 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -354,9 +358,9 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -383,10 +387,11 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str): 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + job_status=[ + ManagedJobStatus.RUNNING, 
ManagedJobStatus.RECOVERING + ], timeout=360), ], f'sky jobs cancel -y -n {name}', @@ -407,9 +412,9 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. @@ -421,9 +426,9 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): '--output text)'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], @@ -452,18 +457,18 @@ def test_managed_jobs_recovery_multi_node_gcp(): 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee 
/tmp/{name}-run-id', # Terminate the worker manually. terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], @@ -489,15 +494,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + ], timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' @@ -507,14 +513,14 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. 
- WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' @@ -524,9 +530,9 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', # The job is running in the cluster, will shown as RUNNING. - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -537,9 +543,9 @@ def test_managed_jobs_cancellation_aws(aws_config_region): JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. 
We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. @@ -575,41 +581,41 @@ def test_managed_jobs_cancellation_gcp(): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.STARTING.value, + job_status=[ManagedJobStatus.STARTING], timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
@@ -700,9 +706,9 @@ def test_managed_jobs_storage(generic_cloud: str): *STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, + job_status=[ManagedJobStatus.SUCCEEDED], timeout=60 + _BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket @@ -727,15 +733,16 @@ def test_managed_jobs_tpu(): 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.STARTING.value, + job_status=[ManagedJobStatus.STARTING], timeout=60 + _BUMP_UP_SECONDS), # TPU takes a while to launch - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', + job_status=[ + ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED + ], timeout=900 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -754,9 +761,9 @@ def test_managed_jobs_inline_env(generic_cloud: str): 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, + job_status=[ManagedJobStatus.SUCCEEDED], timeout=20 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index 3000c82068d..bbfe3874315 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -25,10 +25,10 @@ import pytest from smoke_tests.util import get_cluster_name from smoke_tests.util import get_cmd_wait_until_cluster_status_contains_wildcard +from smoke_tests.util import ( + get_cmd_wait_until_managed_job_status_contains_matching_job_name) from smoke_tests.util import run_one_test from smoke_tests.util import Test -from smoke_tests.util import ( - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) from sky.jobs.state import ManagedJobStatus from sky.skylet import constants @@ -87,10 +87,12 @@ def test_aws_with_ssh_proxy_command(): cluster_status=ClusterStatus.UP.value, timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', + job_status=[ + ManagedJobStatus.SUCCEEDED, ManagedJobStatus.RUNNING, + ManagedJobStatus.STARTING + ], timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_required_before_merge.py index dd368718821..677db104549 100644 --- a/tests/smoke_tests/test_required_before_merge.py +++ 
b/tests/smoke_tests/test_required_before_merge.py @@ -20,9 +20,10 @@ # > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws from smoke_tests.util import get_cluster_name +from smoke_tests.util import ( + get_cmd_wait_until_job_status_contains_matching_job_id) from smoke_tests.util import run_one_test from smoke_tests.util import Test -from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID from sky.skylet import events from sky.skylet.job_lib import JobStatus @@ -34,10 +35,10 @@ def test_yaml_launch_and_mount(generic_cloud: str): 'test_yaml_launch_and_mount', [ f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', - WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=1, - job_status=JobStatus.SUCCEEDED.value, + job_status=[JobStatus.SUCCEEDED], timeout=2 * 60), ], f'sky down -y {name}', diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 37b61caa328..0e5c4dd2d8d 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -1,3 +1,4 @@ +import enum import inspect import os import subprocess @@ -56,7 +57,16 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( + +def _statuses_to_str(statuses: List[enum.Enum]): + """Convert a list of enums to a string with all the values separated by |.""" + if len(statuses) > 1: + return '(' + '|'.join([status.value for status in statuses]) + ')' + else: + return statuses[0].value + + +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 
'start_time=$SECONDS; ' @@ -75,20 +85,29 @@ 'done') +def get_cmd_wait_until_cluster_status_contains( + cluster_name: str, cluster_status: List[ClusterStatus], timeout: int): + return _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=cluster_name, + cluster_status=_statuses_to_str(cluster_status), + timeout=timeout) + + def get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard: str, cluster_status: str, timeout: int): - wait_cmd = WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + cluster_name_wildcard: str, cluster_status: List[ClusterStatus], + timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', 'awk "/^{cluster_name_awk}/') return wait_cmd.format(cluster_name=cluster_name_wildcard, cluster_name_awk=cluster_name_wildcard.replace( '*', '.*'), - cluster_status=cluster_status, + cluster_status=_statuses_to_str(cluster_status), timeout=timeout) -WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' 'while true; do ' @@ -98,11 +117,17 @@ def get_cmd_wait_until_cluster_status_contains_wildcard( 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' ' echo "Cluster {cluster_name} successfully removed."; break; ' 'fi; ' - 'echo "Waiting for cluster {name} to be removed..."; ' + 'echo "Waiting for cluster {cluster_name} to be removed..."; ' 'sleep 10; ' 'done') -WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( + +def get_cmd_wait_until_cluster_is_not_found(cluster_name: str, timeout: int): + return _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=cluster_name, + timeout=timeout) + + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. 
'start_time=$SECONDS; ' @@ -127,20 +152,58 @@ def get_cmd_wait_until_cluster_status_contains_wildcard( 'sleep 10; ' 'done') -WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "') -WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') + +def get_cmd_wait_until_job_status_contains_matching_job_id( + cluster_name: str, job_id: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=cluster_name, + job_id=job_id, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def get_cmd_wait_until_job_status_contains_without_matching_job( + cluster_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=cluster_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def get_cmd_wait_until_job_status_contains_matching_job_name( + cluster_name: str, job_name: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + cluster_name=cluster_name, + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + # Managed job functions -WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( 'sky queue {cluster_name}', 'sky jobs queue').replace( 'awk "\\$2 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\" || 
\\$3 == \\"{job_name}\\"').replace( _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) + +def get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by # _BUMP_UP_SECONDS before exiting. From 41bac9bc9eb838dfbf4bd14d56ac15cc7fca4be1 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 12:13:52 +0800 Subject: [PATCH 28/64] bug fix --- tests/smoke_tests/test_managed_job.py | 40 +++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 5f3e3b2117c..f41e0a6c2ca 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -27,7 +27,7 @@ import time import pytest -from smoke_tests.util import _BUMP_UP_SECONDS +from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name from smoke_tests.util import ( get_cmd_wait_until_managed_job_status_contains_matching_job_name) @@ -147,10 +147,10 @@ def test_managed_jobs_failed_setup(generic_cloud: str): [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', # Make sure the job failed quickly. - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', - timeout=330 + _BUMP_UP_SECONDS), + job_status=[ManagedJobStatus.FAILED_SETUP], + timeout=330 + BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
@@ -499,12 +499,12 @@ def test_managed_jobs_cancellation_aws(aws_config_region): job_status=[ ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -516,12 +516,12 @@ def test_managed_jobs_cancellation_aws(aws_config_region): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -533,7 +533,7 @@ def test_managed_jobs_cancellation_aws(aws_config_region): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' @@ -546,7 +546,7 @@ def test_managed_jobs_cancellation_aws(aws_config_region): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$(aws ec2 describe-instances --region {region} ' @@ -584,30 +584,30 @@ def test_managed_jobs_cancellation_gcp(): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.STARTING], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), @@ -616,7 +616,7 @@ def test_managed_jobs_cancellation_gcp(): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' @@ -709,7 +709,7 @@ def test_managed_jobs_storage(generic_cloud: str): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd @@ -736,14 +736,14 @@ def test_managed_jobs_tpu(): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.STARTING], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), # TPU takes a while to launch get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED ], - timeout=900 + _BUMP_UP_SECONDS), + timeout=900 + BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since 
sky jobs queue -r can be blocked by other spot tests. @@ -764,7 +764,7 @@ def test_managed_jobs_inline_env(generic_cloud: str): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], - timeout=20 + _BUMP_UP_SECONDS), + timeout=20 + BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From fd46f09a135c1c8e2bce702b66493b4c53279186 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 13:05:59 +0800 Subject: [PATCH 29/64] bug fix --- tests/smoke_tests/test_managed_job.py | 2 +- tests/smoke_tests/test_mount_and_storage.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index f41e0a6c2ca..44ab29705ea 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -27,6 +27,7 @@ import time import pytest +from smoke_tests.test_mount_and_storage import TestStorageWithCredentials from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name from smoke_tests.util import ( @@ -36,7 +37,6 @@ from smoke_tests.util import run_one_test from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -from smoke_tests.util import TestStorageWithCredentials from sky import jobs from sky.data import storage as storage_lib diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py index 95952d3b432..6a2f0944fec 100644 --- a/tests/smoke_tests/test_mount_and_storage.py +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -38,7 +38,6 @@ from smoke_tests.util import SCP_TYPE from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -from smoke_tests.util import TestStorageWithCredentials import sky from sky import global_user_state From dc71b72a9990d7709f409cc1f425f1e77341001e Mon 
Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 13:17:35 +0800 Subject: [PATCH 30/64] bug fix --- tests/smoke_tests/test_basic.py | 5 ++++- tests/smoke_tests/test_managed_job.py | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index 1f76254b67d..8239b25db35 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -26,6 +26,7 @@ import time import pytest +from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name from smoke_tests.util import get_cmd_wait_until_cluster_status_contains from smoke_tests.util import ( @@ -147,7 +148,9 @@ def test_launch_fast_with_autostop(generic_cloud: str): cluster_name=name, cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), - + # Even the cluster is stopped, cloud platform may take a while to + # delete the VM. + f'sleep {BUMP_UP_SECONDS}', # Launch again. Do full output validation - we expect the cluster to re-launch f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 44ab29705ea..e8d13c21354 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -61,15 +61,15 @@ def test_managed_jobs(generic_cloud: str): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.INIT, - ManagedJobStatus.RUNNING + ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], timeout=60), get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.INIT, - ManagedJobStatus.RUNNING + ManagedJobStatus.PENDING, 
ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], timeout=60), f'sky jobs cancel -y -n {name}-1', From e68430be2e17cf950f0bbdae9c9eff8e49a5be89 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 15:17:12 +0800 Subject: [PATCH 31/64] test pipeline pre merge --- .buildkite/pipeline_pre_merge.yaml | 11 +++++++++++ .buildkite/trigger_build.sh | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 .buildkite/pipeline_pre_merge.yaml create mode 100644 .buildkite/trigger_build.sh diff --git a/.buildkite/pipeline_pre_merge.yaml b/.buildkite/pipeline_pre_merge.yaml new file mode 100644 index 00000000000..4edeb3328fd --- /dev/null +++ b/.buildkite/pipeline_pre_merge.yaml @@ -0,0 +1,11 @@ +steps: + - label: "Validation check" + command: "./buildkite/trigger_build.sh pre-merge-test" + key: "validation-check" + - label: "Run pre merge tests" + command: | + if [ $$(buildkite-agent step get "outcome" --step "validation-check") == "passed" ]; then + buildkite-agent pipeline upload .buildkite/pipeline_smoke_test_required_before_merge.yaml + else + echo "Didn't pass validation, nothing to run" + fi diff --git a/.buildkite/trigger_build.sh b/.buildkite/trigger_build.sh new file mode 100644 index 00000000000..f40c64f60c8 --- /dev/null +++ b/.buildkite/trigger_build.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Parse the webhook payload (read from stdin) +PAYLOAD=$(cat) + +# Define the allowed user(s) +ALLOWED_USERS=("zpoint" "Michaelvll" "concretevitamin" "romilbhardwaj" "cg505" "yika-luo") # GitHub usernames + +# Extract comment body and user info +COMMENT_BODY=$(echo "$PAYLOAD" | jq -r '.comment.body') +COMMENT_USER=$(echo "$PAYLOAD" | jq -r '.comment.user.login') + +# Read the keyword from the first argument +KEYWORD="$1" + +# Check if the comment contains the keyword and the user is authorized +if [[ "$COMMENT_BODY" == *"$KEYWORD"* ]] && + ( [[ " ${ALLOWED_USERS[@]} " =~ " $COMMENT_USER " ]]); then + echo 
"Triggering build because $KEYWORD was mentioned by authorized user: $COMMENT_USER" + exit 0 # Exit with success to continue the build +else + echo "Build not triggered. Either $KEYWORD not found or user not authorized." + exit 1 # Exit with failure to stop the build +fi From d2ab7baf92e2960b61abd3e513d15f42b0d2f389 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 15:24:56 +0800 Subject: [PATCH 32/64] build test --- .buildkite/trigger_build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/trigger_build.sh b/.buildkite/trigger_build.sh index f40c64f60c8..ff57b506c9d 100644 --- a/.buildkite/trigger_build.sh +++ b/.buildkite/trigger_build.sh @@ -3,6 +3,9 @@ # Parse the webhook payload (read from stdin) PAYLOAD=$(cat) +echo "PAYLOAD: $PAYLOAD" +echo "KEYWORD: $1" + # Define the allowed user(s) ALLOWED_USERS=("zpoint" "Michaelvll" "concretevitamin" "romilbhardwaj" "cg505" "yika-luo") # GitHub usernames From d2a065e1187990cae878ead7c18d3b2e26b86edc Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 15:52:58 +0800 Subject: [PATCH 33/64] test again --- .buildkite/generate_pipeline.py | 121 ----------------------------- .buildkite/pipeline_pre_merge.yaml | 11 --- .buildkite/trigger_build.sh | 27 ------- 3 files changed, 159 deletions(-) delete mode 100644 .buildkite/generate_pipeline.py delete mode 100644 .buildkite/pipeline_pre_merge.yaml delete mode 100644 .buildkite/trigger_build.sh diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py deleted file mode 100644 index cb135b41a61..00000000000 --- a/.buildkite/generate_pipeline.py +++ /dev/null @@ -1,121 +0,0 @@ -"""This script generates a Buildkite pipeline from test files.""" -import ast -import copy -import os -from typing import Any, Dict, List - -import yaml - -DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] -# We only have credentials for aws, azure, and gcp. -# For those test cases that run on other clouds, -# we currently ignore them. 
-ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] - - -def _get_full_decorator_path(decorator: ast.AST) -> str: - """Recursively get the full path of a decorator.""" - if isinstance(decorator, ast.Attribute): - return f'{_get_full_decorator_path(decorator.value)}.{decorator.attr}' - elif isinstance(decorator, ast.Name): - return decorator.id - elif isinstance(decorator, ast.Call): - return _get_full_decorator_path(decorator.func) - raise ValueError(f'Unknown decorator type: {type(decorator)}') - - -def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: - """Extract test functions and filter clouds with pytest.mark - from a Python test file.""" - with open(file_path, 'r', encoding='utf-8') as file: - tree = ast.parse(file.read(), filename=file_path) - - for node in ast.walk(tree): - for child in ast.iter_child_nodes(node): - setattr(child, 'parent', node) - - function_cloud_map = {} - for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'): - class_name = None - if hasattr(node, 'parent') and isinstance(node.parent, - ast.ClassDef): - class_name = node.parent.name - - clouds_to_include = [] - clouds_to_exclude = [] - for decorator in node.decorator_list: - if isinstance(decorator, ast.Call): - # We only need to consider the decorator with no arguments - # to extract clouds. 
- continue - full_path = _get_full_decorator_path(decorator) - if full_path.startswith('pytest.mark.'): - assert isinstance(decorator, ast.Attribute) - suffix = decorator.attr - if suffix.startswith('no_'): - clouds_to_exclude.append(suffix[3:]) - else: - clouds_to_include.append(suffix) - clouds_to_include = (clouds_to_include if clouds_to_include else - copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) - clouds_to_include = [ - cloud for cloud in clouds_to_include - if cloud not in clouds_to_exclude - ] - final_clouds_to_include = [ - cloud for cloud in clouds_to_include - if cloud in ALL_CLOUDS_WITH_CREDENTIALS - ] - if clouds_to_include and not final_clouds_to_include: - print(f'Warning: {file_path}:{node.name} ' - f'is marked to run on {clouds_to_include}, ' - f'but we do not have credentials for those clouds. ' - f'Skipped.') - continue - function_name = (f'{class_name}::{node.name}' - if class_name else node.name) - function_cloud_map[function_name] = (clouds_to_include) - return function_cloud_map - - -def _generate_pipeline(test_file: str) -> Dict[str, Any]: - """Generate a Buildkite pipeline from test files.""" - steps = [] - function_cloud_map = _extract_marked_tests(test_file) - for test_function, clouds in function_cloud_map.items(): - for cloud in clouds: - step = { - 'label': f'{test_function} on {cloud}', - 'command': f'pytest {test_file}::{test_function} --{cloud}', - 'env': { - 'LOG_TO_STDOUT': '1' - } - } - steps.append(step) - # we only run one cloud per test function for now - break - return {'steps': steps} - - -def main(): - # List of test files to include in the pipeline - test_files = os.listdir('tests/smoke_tests') - - for test_file in test_files: - if not test_file.startswith('test_'): - continue - test_file_path = os.path.join('tests/smoke_tests', test_file) - pipeline = _generate_pipeline(test_file_path) - yaml_file_path = '.buildkite/pipeline_smoke_' + \ - f'{test_file.split(".")[0]}.yaml' - with open(yaml_file_path, 'w', encoding='utf-8') as 
file: - file.write('# This is an auto-generated Buildkite pipeline by ' - '.buildkite/generate_pipeline.py, Please do not ' - 'edit directly.\n') - yaml.dump(pipeline, file, default_flow_style=False) - print(f'Convert {test_file_path} to {yaml_file_path}\n\n') - - -if __name__ == '__main__': - main() diff --git a/.buildkite/pipeline_pre_merge.yaml b/.buildkite/pipeline_pre_merge.yaml deleted file mode 100644 index 4edeb3328fd..00000000000 --- a/.buildkite/pipeline_pre_merge.yaml +++ /dev/null @@ -1,11 +0,0 @@ -steps: - - label: "Validation check" - command: "./buildkite/trigger_build.sh pre-merge-test" - key: "validation-check" - - label: "Run pre merge tests" - command: | - if [ $$(buildkite-agent step get "outcome" --step "validation-check") == "passed" ]; then - buildkite-agent pipeline upload .buildkite/pipeline_smoke_test_required_before_merge.yaml - else - echo "Didn't pass validation, nothing to run" - fi diff --git a/.buildkite/trigger_build.sh b/.buildkite/trigger_build.sh deleted file mode 100644 index ff57b506c9d..00000000000 --- a/.buildkite/trigger_build.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -# Parse the webhook payload (read from stdin) -PAYLOAD=$(cat) - -echo "PAYLOAD: $PAYLOAD" -echo "KEYWORD: $1" - -# Define the allowed user(s) -ALLOWED_USERS=("zpoint" "Michaelvll" "concretevitamin" "romilbhardwaj" "cg505" "yika-luo") # GitHub usernames - -# Extract comment body and user info -COMMENT_BODY=$(echo "$PAYLOAD" | jq -r '.comment.body') -COMMENT_USER=$(echo "$PAYLOAD" | jq -r '.comment.user.login') - -# Read the keyword from the first argument -KEYWORD="$1" - -# Check if the comment contains the keyword and the user is authorized -if [[ "$COMMENT_BODY" == *"$KEYWORD"* ]] && - ( [[ " ${ALLOWED_USERS[@]} " =~ " $COMMENT_USER " ]]); then - echo "Triggering build because $KEYWORD was mentioned by authorized user: $COMMENT_USER" - exit 0 # Exit with success to continue the build -else - echo "Build not triggered. 
Either $KEYWORD not found or user not authorized." - exit 1 # Exit with failure to stop the build -fi From ab6a3112d2380e819bf95aca0036f29616f0a2bc Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 18:13:46 +0800 Subject: [PATCH 34/64] trigger test --- tests/test_smoke.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d1dc2129422..c872fd589f8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -27,6 +27,7 @@ # All files categorized under tests/smoke_tests/* # Please add new test cases under that directory. + from smoke_tests.test_basic import * from smoke_tests.test_cluster_job import * from smoke_tests.test_images import * From 2ada082d451244346bd875b9cd0b2cd422010514 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 26 Nov 2024 14:04:04 +0800 Subject: [PATCH 35/64] bug fix --- tests/smoke_tests/test_region_and_zone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index bbfe3874315..481d1488071 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -84,7 +84,7 @@ def test_aws_with_ssh_proxy_command(): # the job controller is not launched with proxy command. 
get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', - cluster_status=ClusterStatus.UP.value, + cluster_status=[ClusterStatus.UP], timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', get_cmd_wait_until_managed_job_status_contains_matching_job_name( From 9e1416827ec871485ef871be0ec08f54578494ea Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 27 Nov 2024 16:24:27 +0800 Subject: [PATCH 36/64] generate pipeline --- .buildkite/generate_pipeline.py | 121 ++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 .buildkite/generate_pipeline.py diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py new file mode 100644 index 00000000000..cb135b41a61 --- /dev/null +++ b/.buildkite/generate_pipeline.py @@ -0,0 +1,121 @@ +"""This script generates a Buildkite pipeline from test files.""" +import ast +import copy +import os +from typing import Any, Dict, List + +import yaml + +DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] +# We only have credentials for aws, azure, and gcp. +# For those test cases that run on other clouds, +# we currently ignore them. 
+ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] + + +def _get_full_decorator_path(decorator: ast.AST) -> str: + """Recursively get the full path of a decorator.""" + if isinstance(decorator, ast.Attribute): + return f'{_get_full_decorator_path(decorator.value)}.{decorator.attr}' + elif isinstance(decorator, ast.Name): + return decorator.id + elif isinstance(decorator, ast.Call): + return _get_full_decorator_path(decorator.func) + raise ValueError(f'Unknown decorator type: {type(decorator)}') + + +def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: + """Extract test functions and filter clouds with pytest.mark + from a Python test file.""" + with open(file_path, 'r', encoding='utf-8') as file: + tree = ast.parse(file.read(), filename=file_path) + + for node in ast.walk(tree): + for child in ast.iter_child_nodes(node): + setattr(child, 'parent', node) + + function_cloud_map = {} + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'): + class_name = None + if hasattr(node, 'parent') and isinstance(node.parent, + ast.ClassDef): + class_name = node.parent.name + + clouds_to_include = [] + clouds_to_exclude = [] + for decorator in node.decorator_list: + if isinstance(decorator, ast.Call): + # We only need to consider the decorator with no arguments + # to extract clouds. 
+ continue + full_path = _get_full_decorator_path(decorator) + if full_path.startswith('pytest.mark.'): + assert isinstance(decorator, ast.Attribute) + suffix = decorator.attr + if suffix.startswith('no_'): + clouds_to_exclude.append(suffix[3:]) + else: + clouds_to_include.append(suffix) + clouds_to_include = (clouds_to_include if clouds_to_include else + copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) + clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud not in clouds_to_exclude + ] + final_clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud in ALL_CLOUDS_WITH_CREDENTIALS + ] + if clouds_to_include and not final_clouds_to_include: + print(f'Warning: {file_path}:{node.name} ' + f'is marked to run on {clouds_to_include}, ' + f'but we do not have credentials for those clouds. ' + f'Skipped.') + continue + function_name = (f'{class_name}::{node.name}' + if class_name else node.name) + function_cloud_map[function_name] = (clouds_to_include) + return function_cloud_map + + +def _generate_pipeline(test_file: str) -> Dict[str, Any]: + """Generate a Buildkite pipeline from test files.""" + steps = [] + function_cloud_map = _extract_marked_tests(test_file) + for test_function, clouds in function_cloud_map.items(): + for cloud in clouds: + step = { + 'label': f'{test_function} on {cloud}', + 'command': f'pytest {test_file}::{test_function} --{cloud}', + 'env': { + 'LOG_TO_STDOUT': '1' + } + } + steps.append(step) + # we only run one cloud per test function for now + break + return {'steps': steps} + + +def main(): + # List of test files to include in the pipeline + test_files = os.listdir('tests/smoke_tests') + + for test_file in test_files: + if not test_file.startswith('test_'): + continue + test_file_path = os.path.join('tests/smoke_tests', test_file) + pipeline = _generate_pipeline(test_file_path) + yaml_file_path = '.buildkite/pipeline_smoke_' + \ + f'{test_file.split(".")[0]}.yaml' + with open(yaml_file_path, 'w', encoding='utf-8') as 
file: + file.write('# This is an auto-generated Buildkite pipeline by ' + '.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n') + yaml.dump(pipeline, file, default_flow_style=False) + print(f'Convert {test_file_path} to {yaml_file_path}\n\n') + + +if __name__ == '__main__': + main() From a2b04154b8f506b9aa1899b17673772aa83cdd4c Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 27 Nov 2024 16:34:18 +0800 Subject: [PATCH 37/64] robust generate pipeline --- .buildkite/generate_pipeline.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index cb135b41a61..56f65d21460 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -12,6 +12,11 @@ # we currently ignore them. ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] +ALL_CLOUDS_IN_SMOKE_TESTS = [ + 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', + 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace' +] + def _get_full_decorator_path(decorator: ast.AST) -> str: """Recursively get the full path of a decorator.""" @@ -56,6 +61,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if suffix.startswith('no_'): clouds_to_exclude.append(suffix[3:]) else: + if suffix not in ALL_CLOUDS_IN_SMOKE_TESTS: + # This mark does not specify a cloud, so we skip it. 
+ continue clouds_to_include.append(suffix) clouds_to_include = (clouds_to_include if clouds_to_include else copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) From e764192f9c2f19b5b0aface270d8faa859bba3ce Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 28 Nov 2024 16:41:32 +0800 Subject: [PATCH 38/64] refactor pipeline --- .buildkite/generate_pipeline.py | 24 +- .buildkite/pipeline_smoke_test_basic.yaml | 82 -- .../pipeline_smoke_test_cluster_job.yaml | 178 ---- .buildkite/pipeline_smoke_test_images.yaml | 66 -- .../pipeline_smoke_test_managed_job.yaml | 46 - ...pipeline_smoke_test_mount_and_storage.yaml | 114 --- .../pipeline_smoke_test_region_and_zone.yaml | 28 - ...line_smoke_test_required_before_merge.yaml | 7 - .buildkite/pipeline_smoke_test_sky_serve.yaml | 33 - .../pipeline_smoke_tests_pre_merge.yaml | 8 + .buildkite/pipeline_smoke_tests_release.yaml | 874 ++++++++++++++++++ 11 files changed, 900 insertions(+), 560 deletions(-) delete mode 100644 .buildkite/pipeline_smoke_test_basic.yaml delete mode 100644 .buildkite/pipeline_smoke_test_cluster_job.yaml delete mode 100644 .buildkite/pipeline_smoke_test_images.yaml delete mode 100644 .buildkite/pipeline_smoke_test_managed_job.yaml delete mode 100644 .buildkite/pipeline_smoke_test_mount_and_storage.yaml delete mode 100644 .buildkite/pipeline_smoke_test_region_and_zone.yaml delete mode 100644 .buildkite/pipeline_smoke_test_required_before_merge.yaml delete mode 100644 .buildkite/pipeline_smoke_test_sky_serve.yaml create mode 100644 .buildkite/pipeline_smoke_tests_pre_merge.yaml create mode 100644 .buildkite/pipeline_smoke_tests_release.yaml diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 56f65d21460..6d88e2d48d2 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -1,5 +1,6 @@ """This script generates a Buildkite pipeline from test files.""" import ast +from collections import defaultdict import copy import os from typing import Any, Dict, List @@ 
-10,7 +11,7 @@ # We only have credentials for aws, azure, and gcp. # For those test cases that run on other clouds, # we currently ignore them. -ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] +ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp', 'kubernetes'] ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', @@ -97,7 +98,8 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: 'label': f'{test_function} on {cloud}', 'command': f'pytest {test_file}::{test_function} --{cloud}', 'env': { - 'LOG_TO_STDOUT': '1' + 'LOG_TO_STDOUT': '1', + 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' } } steps.append(step) @@ -109,20 +111,30 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: def main(): # List of test files to include in the pipeline test_files = os.listdir('tests/smoke_tests') + output_file_pipelines_map = defaultdict(list) for test_file in test_files: if not test_file.startswith('test_'): continue test_file_path = os.path.join('tests/smoke_tests', test_file) + if test_file == 'test_required_before_merge.py': + yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' + else: + yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' + print(f'Converting {test_file_path} to {yaml_file_path}') pipeline = _generate_pipeline(test_file_path) - yaml_file_path = '.buildkite/pipeline_smoke_' + \ - f'{test_file.split(".")[0]}.yaml' + output_file_pipelines_map[yaml_file_path].append(pipeline) + print(f'Converted {test_file_path} to {yaml_file_path}\n\n') + + for yaml_file_path, pipelines in output_file_pipelines_map.items(): with open(yaml_file_path, 'w', encoding='utf-8') as file: file.write('# This is an auto-generated Buildkite pipeline by ' '.buildkite/generate_pipeline.py, Please do not ' 'edit directly.\n') - yaml.dump(pipeline, file, default_flow_style=False) - print(f'Convert {test_file_path} to {yaml_file_path}\n\n') + final_pipeline = { + 'steps': [pipeline['steps'] for pipeline in pipelines] 
+ } + yaml.dump(final_pipeline, file, default_flow_style=False) if __name__ == '__main__': diff --git a/.buildkite/pipeline_smoke_test_basic.yaml b/.buildkite/pipeline_smoke_test_basic.yaml deleted file mode 100644 index d0ba641c48c..00000000000 --- a/.buildkite/pipeline_smoke_test_basic.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -steps: -- command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - env: - LOG_TO_STDOUT: '1' - label: test_example_app on aws -- command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - env: - LOG_TO_STDOUT: '1' - label: test_minimal on aws -- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - env: - LOG_TO_STDOUT: '1' - label: test_launch_fast on aws -- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - env: - LOG_TO_STDOUT: '1' - label: test_launch_fast_with_autostop on aws -- command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - env: - LOG_TO_STDOUT: '1' - label: test_stale_job on aws -- command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_stale_job_manual_restart on aws -- command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_stale_job_manual_restart on gcp -- command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - env: - LOG_TO_STDOUT: '1' - label: test_env_check on aws -- command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - env: - LOG_TO_STDOUT: '1' - label: test_cli_logs on aws -- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - env: - LOG_TO_STDOUT: '1' - label: test_core_api_sky_launch_exec on gcp -- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws 
- env: - LOG_TO_STDOUT: '1' - label: test_core_api_sky_launch_fast on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_ordered on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_ordered_with_default on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_unordered on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_unordered_with_default on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_resources on aws -- command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - env: - LOG_TO_STDOUT: '1' - label: test_sky_bench on aws -- command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - --aws - env: - LOG_TO_STDOUT: '1' - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws diff --git a/.buildkite/pipeline_smoke_test_cluster_job.yaml b/.buildkite/pipeline_smoke_test_cluster_job.yaml deleted file mode 100644 index 8a813119eb2..00000000000 --- a/.buildkite/pipeline_smoke_test_cluster_job.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - env: - LOG_TO_STDOUT: '1' - label: test_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws - env: - LOG_TO_STDOUT: '1' - label: test_job_queue_with_docker on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode - --aws - env: - LOG_TO_STDOUT: '1' - label: test_job_queue_multinode on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - env: - LOG_TO_STDOUT: '1' - label: test_large_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - env: - LOG_TO_STDOUT: '1' - label: test_fast_large_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - env: - LOG_TO_STDOUT: '1' - label: test_docker_preinstalled_package on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - env: - LOG_TO_STDOUT: '1' - label: test_multi_echo on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - env: - LOG_TO_STDOUT: '1' - label: test_huggingface on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - env: - LOG_TO_STDOUT: '1' - label: test_inferentia on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - env: - LOG_TO_STDOUT: '1' - label: test_tpu on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - env: - LOG_TO_STDOUT: '1' - label: test_tpu_vm on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - env: - LOG_TO_STDOUT: '1' - label: test_tpu_vm_pod on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - env: - LOG_TO_STDOUT: '1' - label: test_multi_hostname on aws -- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - env: - LOG_TO_STDOUT: '1' - label: test_multi_node_failure on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_http_server_with_custom_ports on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_http_server_with_custom_ports on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_http_server_with_custom_ports on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - env: - LOG_TO_STDOUT: '1' - label: test_task_labels_aws on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - env: - LOG_TO_STDOUT: '1' - label: test_task_labels_gcp on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - env: - LOG_TO_STDOUT: '1' - label: test_distributed_tf on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_start_stop on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_start_stop on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - env: - LOG_TO_STDOUT: '1' - label: test_autostop on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - env: - LOG_TO_STDOUT: '1' - label: test_autodown on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - env: - LOG_TO_STDOUT: '1' - label: test_cancel_aws on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - env: - LOG_TO_STDOUT: '1' - label: 
test_cancel_gcp on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - env: - LOG_TO_STDOUT: '1' - label: test_cancel_azure on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - env: - LOG_TO_STDOUT: '1' - label: test_cancel_pytorch on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - env: - LOG_TO_STDOUT: '1' - label: test_use_spot on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - env: - LOG_TO_STDOUT: '1' - label: test_stop_gcp_spot on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - env: - LOG_TO_STDOUT: '1' - label: test_inline_env on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - env: - LOG_TO_STDOUT: '1' - label: test_inline_env_file on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_custom_image on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_start_stop_two_nodes on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_disk_tier on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_disk_tier on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_disk_tier on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_best_tier_failover on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws - env: - LOG_TO_STDOUT: '1' - label: 
test_aws_zero_quota_failover on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_zero_quota_failover on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - env: - LOG_TO_STDOUT: '1' - label: test_long_setup_run_script on aws diff --git a/.buildkite/pipeline_smoke_test_images.yaml b/.buildkite/pipeline_smoke_test_images.yaml deleted file mode 100644 index 4991fccbbc7..00000000000 --- a/.buildkite/pipeline_smoke_test_images.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -steps: -- command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_images on aws -- command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_images on gcp -- command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_images on azure -- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_image_id_dict on aws -- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_image_id_dict on gcp -- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_image_id_dict_region on aws -- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_image_id_dict_region on gcp -- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_image_id_dict_zone on aws -- command: pytest 
tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_image_id_dict_zone on gcp -- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - env: - LOG_TO_STDOUT: '1' - label: test_clone_disk_aws on aws -- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - env: - LOG_TO_STDOUT: '1' - label: test_clone_disk_gcp on gcp -- command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_mig on gcp -- command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_force_enable_external_ips on gcp -- command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - env: - LOG_TO_STDOUT: '1' - label: test_image_no_conda on aws -- command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws - env: - LOG_TO_STDOUT: '1' - label: test_custom_default_conda_env on aws diff --git a/.buildkite/pipeline_smoke_test_managed_job.yaml b/.buildkite/pipeline_smoke_test_managed_job.yaml deleted file mode 100644 index fee2ae1f3c8..00000000000 --- a/.buildkite/pipeline_smoke_test_managed_job.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_pipeline_recovery_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_pipeline_recovery_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_multi_node_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_multi_node_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_cancellation_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_cancellation_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_tpu on gcp diff --git a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml deleted file mode 100644 index 01f8739dd79..00000000000 --- a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml +++ /dev/null @@ -1,114 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do 
not edit directly. -steps: -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws - env: - LOG_TO_STDOUT: '1' - label: test_file_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - env: - LOG_TO_STDOUT: '1' - label: test_using_file_mounts_with_env_vars on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_storage_mounts_with_stop on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_storage_mounts_with_stop on gcp -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_storage_mounts_with_stop on azure -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - env: - LOG_TO_STDOUT: '1' - label: test_docker_storage_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - env: - LOG_TO_STDOUT: '1' - 
label: TestStorageWithCredentials::test_bucket_external_deletion on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_public_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_nonexistent_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_private_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_list_source on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_invalid_names on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - env: - LOG_TO_STDOUT: '1' 
- label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_aws_regions on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_gcs_regions on aws diff --git a/.buildkite/pipeline_smoke_test_region_and_zone.yaml b/.buildkite/pipeline_smoke_test_region_and_zone.yaml deleted file mode 100644 index aa955bc1864..00000000000 --- a/.buildkite/pipeline_smoke_test_region_and_zone.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_region on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_with_ssh_proxy_command on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_region_and_service_account on gcp -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_region on azure -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_zone on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_zone on gcp diff --git a/.buildkite/pipeline_smoke_test_required_before_merge.yaml b/.buildkite/pipeline_smoke_test_required_before_merge.yaml deleted file mode 100644 index 8a29f838e4e..00000000000 --- a/.buildkite/pipeline_smoke_test_required_before_merge.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -steps: -- command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - env: - LOG_TO_STDOUT: '1' - label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_test_sky_serve.yaml b/.buildkite/pipeline_smoke_test_sky_serve.yaml deleted file mode 100644 index 4cd4d35aa4d..00000000000 --- a/.buildkite/pipeline_smoke_test_sky_serve.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_gcp_http on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_aws_http on aws -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_azure_http on azure -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_spot_recovery on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_dynamic_ondemand_fallback on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_auto_restart on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - env: - LOG_TO_STDOUT: '1' - label: test_user_dependencies on aws diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml new file mode 100644 index 00000000000..a1f68140299 --- /dev/null +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -0,0 +1,8 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml new file mode 100644 index 00000000000..6a3ec46d52d --- /dev/null +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -0,0 +1,874 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_example_app on aws + - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_minimal on aws + - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_launch_fast on aws + - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_launch_fast_with_autostop on aws + - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_stale_job on aws + - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_stale_job_manual_restart on aws + - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_stale_job_manual_restart on gcp + - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_env_check on aws + - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cli_logs on aws + - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_core_api_sky_launch_exec on gcp + - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_core_api_sky_launch_fast on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_ordered on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_ordered_with_default on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_unordered on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_unordered_with_default on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_resources on aws + - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_sky_bench on aws + - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + env: + LOG_TO_STDOUT: 
'1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_context_failover on kubernetes + - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws +- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_queue_with_docker on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_lambda_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_queue_multinode on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_fast_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_docker_preinstalled_package on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multi_echo on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_lambda_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_inferentia on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu_vm on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu_vm_pod on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu_pod_slice_gke on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multi_hostname on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multi_node_failure on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_http_server_with_custom_ports on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_http_server_with_custom_ports on aws + - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_http_server_with_custom_ports on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_http_server_with_custom_ports on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_runpod_http_server_with_custom_ports on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_task_labels_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_task_labels_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_task_labels_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: 
test_container_logs_multinode_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_container_logs_two_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_distributed_tf on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_start_stop on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_start_stop on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_autostop on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_autodown on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_azure on 
azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_pytorch on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_use_spot on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_stop_gcp_spot on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_inline_env on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_inline_env_file on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_custom_image on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_custom_image on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_start_stop_two_nodes on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_disk_tier on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_disk_tier on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + env: + 
LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_disk_tier on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_best_tier_failover on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_zero_quota_failover on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_zero_quota_failover on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_long_setup_run_script on aws +- - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_pipeline on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_pipeline_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_aws on aws + - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_pipeline_recovery_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_pipeline_recovery_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_default_resources on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_multi_node_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_multi_node_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_cancellation_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_cancellation_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_storage on aws + - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_tpu on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_inline_env on aws +- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_images on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_images on gcp + - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_images on azure + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_region on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_region on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_zone on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_zone on gcp + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_aws on aws + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_gcp on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_mig on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_force_enable_external_ips on gcp + - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_image_no_conda on aws + - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_custom_default_conda_env on aws +- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_gcp_http on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_aws_http on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_azure_http on azure + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_kubernetes_http on kubernetes + - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_llm on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_spot_recovery on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_base_ondemand_fallback on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_dynamic_ondemand_fallback on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_user_bug_restart on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_load_balancer on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_auto_restart on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_cancel on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_streaming on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_readiness_timeout_fail on aws + - 
command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_large_readiness_timeout on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_rolling_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_fast_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_update_autoscale on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_new_autoscaler_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_failures on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_user_dependencies on aws +- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_file_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: 
test_using_file_mounts_with_env_vars on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_storage_mounts_with_stop on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_storage_mounts_with_stop on gcp + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_storage_mounts_with_stop on azure + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_storage_mounts on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_context_switch on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_docker_storage_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + on aws + - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_bucket_external_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_public_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_nonexistent_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_private_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + 
label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_list_source on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_invalid_names on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_aws_regions on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_gcs_regions on aws +- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_region on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_aws_with_ssh_proxy_command on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_region_and_service_account on gcp + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_region on azure + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_zone on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_zone on gcp From a63a8c9ed4d1ef193b776a76d73a911435ce92d4 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 28 Nov 2024 16:54:52 +0800 Subject: [PATCH 39/64] remove runpod --- .buildkite/generate_pipeline.py | 2 +- .buildkite/pipeline_smoke_tests_release.yaml | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 6d88e2d48d2..721af70437f 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -15,7 +15,7 @@ ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', - 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace' + 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod' ] diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 6a3ec46d52d..65c38dfe774 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -222,12 +222,6 @@ steps: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) label: test_kubernetes_http_server_with_custom_ports on kubernetes - - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_runpod_http_server_with_custom_ports on aws - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws env: LOG_TO_STDOUT: '1' From 7f75f9f919d66b186217cb3fae6e2ff94fc15900 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 14:02:07 +0800 Subject: [PATCH 40/64] hot fix to pass smoke test --- tests/smoke_tests/test_managed_job.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index e8d13c21354..e0d2a6a619b 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -710,6 +710,8 @@ def test_managed_jobs_storage(generic_cloud: str): job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], timeout=60 + BUMP_UP_SECONDS), + # Wait for s3 backend refresh + f'sleep {BUMP_UP_SECONDS}', f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd From 64f928288f6ebc71b8f8b0e7ccdcac82a39c400f Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 14:10:41 +0800 Subject: [PATCH 41/64] random order --- .buildkite/generate_pipeline.py | 7 +- .buildkite/pipeline_smoke_tests_release.yaml | 748 +++++++++---------- 2 files changed, 380 insertions(+), 375 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 721af70437f..a6aa1a025b7 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -3,6 +3,7 @@ from collections import defaultdict import copy import os +import random from typing import Any, Dict, List import yaml @@ -131,8 +132,12 @@ def main(): file.write('# This is an auto-generated Buildkite pipeline by ' '.buildkite/generate_pipeline.py, 
Please do not ' 'edit directly.\n') + all_steps = [pipeline['steps'] for pipeline in pipelines] + # Shuffle the steps to avoid flakyness, consecutive runs of the same + # kind of test may fail for requiring locks on the same resources. + random.shuffle(all_steps) final_pipeline = { - 'steps': [pipeline['steps'] for pipeline in pipelines] + 'steps': all_steps } yaml.dump(final_pipeline, file, default_flow_style=False) diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 65c38dfe774..59ad47bfc27 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -1,5 +1,86 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. steps: +- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_images on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_images on gcp + - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_images on azure + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_region on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + env: + 
LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_region on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_zone on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_zone on gcp + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_aws on aws + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_gcp on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_mig on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_force_enable_external_ips on gcp + - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_image_no_conda on aws + - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_custom_default_conda_env on aws - - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws env: LOG_TO_STDOUT: '1' @@ -106,763 +187,682 @@ steps: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws +- - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws + label: test_managed_jobs on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_queue_with_docker on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + label: test_job_pipeline on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_lambda_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + label: test_managed_jobs_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_queue_multinode on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + label: test_managed_jobs_pipeline_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws + label: test_managed_jobs_recovery_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_fast_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + label: test_managed_jobs_recovery_gcp on gcp + - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_docker_preinstalled_package on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + label: test_managed_jobs_pipeline_recovery_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_multi_echo on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + label: test_managed_jobs_pipeline_recovery_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface + label: test_managed_jobs_recovery_default_resources on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_lambda_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + label: test_managed_jobs_recovery_multi_node_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_inferentia on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + label: test_managed_jobs_recovery_multi_node_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + label: test_managed_jobs_cancellation_aws on aws + - 
command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu_vm on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + label: test_managed_jobs_cancellation_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu_vm_pod on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke - --kubernetes + label: test_managed_jobs_storage on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu_pod_slice_gke on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + label: test_managed_jobs_tpu on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_multi_hostname on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure + label: test_managed_jobs_inline_env on aws +- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_multi_node_failure on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp + label: test_file_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_http_server_with_custom_ports on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + label: test_using_file_mounts_with_env_vars on aws + - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_http_server_with_custom_ports on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + label: test_aws_storage_mounts_with_stop on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_storage_mounts_with_stop on gcp + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_http_server_with_custom_ports on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + label: test_azure_storage_mounts_with_stop on azure + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_http_server_with_custom_ports on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + label: test_kubernetes_storage_mounts on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_task_labels_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + label: test_kubernetes_context_switch on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_task_labels_gcp on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes + label: 
test_docker_storage_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_task_labels_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_container_logs_multinode_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes + label: TestStorageWithCredentials::test_bucket_external_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws env: 
LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_container_logs_two_jobs_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + label: TestStorageWithCredentials::test_public_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_distributed_tf on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + label: TestStorageWithCredentials::test_nonexistent_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_start_stop on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + label: TestStorageWithCredentials::test_private_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_start_stop on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws env: LOG_TO_STDOUT: '1' 
PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_autostop on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_autodown on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_gcp on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_azure on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_pytorch on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_use_spot on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_stop_gcp_spot on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_inline_env on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_inline_env_file on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: 
${PYTHONPATH}:$(pwd) - label: test_aws_custom_image on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes + label: TestStorageWithCredentials::test_list_source on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_custom_image on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure + label: TestStorageWithCredentials::test_invalid_names on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_start_stop_two_nodes on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_disk_tier on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_disk_tier on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + label: TestStorageWithCredentials::test_aws_regions on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: 
${PYTHONPATH}:$(pwd) - label: test_azure_disk_tier on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure + label: TestStorageWithCredentials::test_gcs_regions on aws +- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_best_tier_failover on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + label: test_aws_region on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_zero_quota_failover on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + label: test_aws_with_ssh_proxy_command on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_zero_quota_failover on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_long_setup_run_script on aws -- - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + label: test_gcp_region_and_service_account on gcp + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + label: test_azure_region on azure + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_pipeline on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws + label: 
test_aws_zone on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws + label: test_gcp_zone on gcp +- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_pipeline_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + label: test_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp + label: test_job_queue_with_docker on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + label: test_lambda_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_pipeline_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp + label: test_job_queue_multinode on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_pipeline_recovery_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + label: 
test_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_default_resources on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + label: test_fast_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_multi_node_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_multi_node_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws + label: test_docker_preinstalled_package on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_cancellation_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp + label: test_multi_echo on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_cancellation_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + label: test_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_storage on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_tpu on gcp - - command: 
pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws + label: test_lambda_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_inline_env on aws -- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + label: test_inferentia on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_images on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + label: test_tpu on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_images on gcp - - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + label: test_tpu_vm on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_images on azure - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + label: test_tpu_vm_pod on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_image_id_dict on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + label: test_tpu_pod_slice_gke on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_image_id_dict on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + label: test_multi_hostname on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: 
${PYTHONPATH}:$(pwd) - label: test_aws_image_id_dict_region on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + label: test_multi_node_failure on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_image_id_dict_region on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + label: test_gcp_http_server_with_custom_ports on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_image_id_dict_zone on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_image_id_dict_zone on gcp - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + label: test_aws_http_server_with_custom_ports on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_clone_disk_aws on aws - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + label: test_azure_http_server_with_custom_ports on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_clone_disk_gcp on gcp - - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + label: test_kubernetes_http_server_with_custom_ports on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_mig on gcp - - command: pytest 
tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp + label: test_task_labels_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_force_enable_external_ips on gcp - - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + label: test_task_labels_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_image_no_conda on aws - - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws + label: test_task_labels_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_custom_default_conda_env on aws -- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_gcp_http on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_aws_http on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http - --azure + label: test_container_logs_multinode_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + 
--kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_azure_http on azure - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + label: test_container_logs_two_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_kubernetes_http on kubernetes - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_llm on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp + label: test_distributed_tf on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_spot_recovery on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws + label: test_gcp_start_stop on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_base_ondemand_fallback on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp + label: test_azure_start_stop on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_dynamic_ondemand_fallback on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws + label: test_autostop on aws + - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_autodown --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_user_bug_restart on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws + label: test_autodown on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_load_balancer on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp + label: test_cancel_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_auto_restart on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + label: test_cancel_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_cancel on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + label: test_cancel_azure on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_streaming on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws + label: test_cancel_pytorch on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_readiness_timeout_fail on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws + label: test_use_spot on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: 
test_skyserve_large_readiness_timeout on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + label: test_stop_gcp_spot on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws + label: test_inline_env on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_rolling_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update - --aws + label: test_inline_env_file on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_fast_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws + label: test_aws_custom_image on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update_autoscale on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws + label: test_kubernetes_custom_image on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_new_autoscaler_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + label: test_azure_start_stop_two_nodes on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_failures on aws - - command: 
pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + label: test_aws_disk_tier on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_user_dependencies on aws -- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts - --aws + label: test_gcp_disk_tier on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_file_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws + label: test_azure_disk_tier on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_using_file_mounts_with_env_vars on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + label: test_azure_best_tier_failover on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_storage_mounts_with_stop on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + label: test_aws_zero_quota_failover on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_storage_mounts_with_stop on gcp - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure + label: test_gcp_zero_quota_failover on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_storage_mounts_with_stop on 
azure - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes + label: test_long_setup_run_script on aws +- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_storage_mounts on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes + label: test_skyserve_gcp_http on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_context_switch on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws + label: test_skyserve_aws_http on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_docker_storage_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws + label: test_skyserve_azure_http on azure + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws + label: test_skyserve_kubernetes_http on kubernetes + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - on aws - - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws + label: test_skyserve_llm on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + label: test_skyserve_spot_recovery on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_bucket_external_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws + label: test_skyserve_base_ondemand_fallback on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + label: test_skyserve_dynamic_ondemand_fallback on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_public_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + label: test_skyserve_user_bug_restart on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_nonexistent_bucket on aws - - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws + label: test_skyserve_load_balancer on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_private_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws + label: test_skyserve_auto_restart on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws + label: test_skyserve_cancel on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + label: test_skyserve_streaming on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_list_source on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + label: test_skyserve_readiness_timeout_fail on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_invalid_names on aws - - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws + label: test_skyserve_large_readiness_timeout on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + label: test_skyserve_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + label: test_skyserve_rolling_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_aws_regions on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + label: test_skyserve_fast_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_gcs_regions on aws -- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_region on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + label: test_skyserve_update_autoscale on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update --aws env: 
LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_with_ssh_proxy_command on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_region_and_service_account on gcp - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_region on azure - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + label: test_skyserve_new_autoscaler_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_zone on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + label: test_skyserve_failures on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_zone on gcp + label: test_user_dependencies on aws From 543ced443adb5c668f34cbad9316f49fc651f50b Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 21:46:24 +0800 Subject: [PATCH 42/64] allow parameter --- .buildkite/generate_pipeline.py | 121 +- .../pipeline_smoke_tests_pre_merge.yaml | 17 +- .buildkite/pipeline_smoke_tests_release.yaml | 1429 +++++++++-------- 3 files changed, 816 insertions(+), 751 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index a6aa1a025b7..5b1aded60fd 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -4,22 +4,34 @@ import copy import os import random -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import yaml DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] -# We only have credentials for aws, azure, and gcp. 
-# For those test cases that run on other clouds, -# we currently ignore them. -ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp', 'kubernetes'] ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod' ] - - +QUEUE_GENERIC_CLOUD = 'generic_cloud' +QUEUE_KUBERNETES = 'kubernetes' +# Only aws, gcp, azure, and kubernetes are supported for now. +# Other clouds do not have credentials. +CLOUD_QUEUE_MAP = { + 'aws': QUEUE_GENERIC_CLOUD, + 'gcp': QUEUE_GENERIC_CLOUD, + 'azure': QUEUE_GENERIC_CLOUD, + 'kubernetes': QUEUE_KUBERNETES +} + +GENERATED_FILE_HEAD = ( + '# This is an auto-generated Buildkite pipeline by ' + '.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n' +) + + def _get_full_decorator_path(decorator: ast.AST) -> str: """Recursively get the full path of a decorator.""" if isinstance(decorator, ast.Attribute): @@ -75,7 +87,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: ] final_clouds_to_include = [ cloud for cloud in clouds_to_include - if cloud in ALL_CLOUDS_WITH_CREDENTIALS + if cloud in CLOUD_QUEUE_MAP ] if clouds_to_include and not final_clouds_to_include: print(f'Warning: {file_path}:{node.name} ' @@ -89,7 +101,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: return function_cloud_map -def _generate_pipeline(test_file: str) -> Dict[str, Any]: +def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dict[str, Any]: """Generate a Buildkite pipeline from test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) @@ -98,49 +110,86 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: step = { 'label': f'{test_function} on {cloud}', 'command': f'pytest {test_file}::{test_function} --{cloud}', - 'env': { - 'LOG_TO_STDOUT': '1', - 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' - } + 'agents': { + # Separate agent pool for each cloud. 
+ # Since some are more costly + 'queue': CLOUD_QUEUE_MAP[cloud] + }, + 'if': f'build.env.{cloud} == \'1\'' } steps.append(step) - # we only run one cloud per test function for now - break + if one_cloud_per_test_function: + break return {'steps': steps} -def main(): - # List of test files to include in the pipeline - test_files = os.listdir('tests/smoke_tests') - output_file_pipelines_map = defaultdict(list) - - for test_file in test_files: - if not test_file.startswith('test_'): - continue - test_file_path = os.path.join('tests/smoke_tests', test_file) - if test_file == 'test_required_before_merge.py': - yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' - else: - yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' - print(f'Converting {test_file_path} to {yaml_file_path}') - pipeline = _generate_pipeline(test_file_path) - output_file_pipelines_map[yaml_file_path].append(pipeline) - print(f'Converted {test_file_path} to {yaml_file_path}\n\n') +def _dump_pipeline_to_file( + output_file_pipelines_map: Dict[str, List[Dict[str, Any]]], + extra_env: Optional[Dict[str, str]] = None): + default_env = { + 'LOG_TO_STDOUT': '1', + 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' + } + if extra_env: + default_env.update(extra_env) for yaml_file_path, pipelines in output_file_pipelines_map.items(): with open(yaml_file_path, 'w', encoding='utf-8') as file: - file.write('# This is an auto-generated Buildkite pipeline by ' - '.buildkite/generate_pipeline.py, Please do not ' - 'edit directly.\n') + file.write(GENERATED_FILE_HEAD) all_steps = [pipeline['steps'] for pipeline in pipelines] # Shuffle the steps to avoid flakyness, consecutive runs of the same # kind of test may fail for requiring locks on the same resources. 
random.shuffle(all_steps) final_pipeline = { - 'steps': all_steps + 'steps': all_steps, + 'env': default_env } yaml.dump(final_pipeline, file, default_flow_style=False) +def _convert_release(test_files: List[str]): + yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' + output_file_pipelines_map = defaultdict(list) + for test_file in test_files: + print(f'Converting {test_file} to {yaml_file_path}') + # We only need to run one cloud per test function. + pipeline = _generate_pipeline(test_file, True) + output_file_pipelines_map[yaml_file_path].append(pipeline) + print(f'Converted {test_file} to {yaml_file_path}\n\n') + # Enable all clouds by default for release pipeline. + _dump_pipeline_to_file(output_file_pipelines_map, extra_env={ + cloud: '1' for cloud in CLOUD_QUEUE_MAP + }) + + +def _convert_pre_merge(test_files: List[str]): + yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' + output_file_pipelines_map = defaultdict(list) + for test_file in test_files: + print(f'Converting {test_file} to {yaml_file_path}') + # We want enable all clouds by default for each test function + # for pre-merge. And let the author controls which clouds + # to run by parameter. 
+ pipeline = _generate_pipeline(test_file, False) + output_file_pipelines_map[yaml_file_path].append(pipeline) + print(f'Converted {test_file} to {yaml_file_path}\n\n') + _dump_pipeline_to_file(output_file_pipelines_map) + +def main(): + test_files = os.listdir('tests/smoke_tests') + release_files = [] + pre_merge_files = [] + for test_file in test_files: + if not test_file.startswith('test_'): + continue + test_file_path = os.path.join('tests/smoke_tests', test_file) + if "required_before_merge" in test_file: + pre_merge_files.append(test_file_path) + else: + release_files.append(test_file_path) + + _convert_release(release_files) + _convert_pre_merge(pre_merge_files) + if __name__ == '__main__': main() diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index a1f68140299..be0e34876dc 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -1,8 +1,17 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) steps: -- - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_yaml_launch_and_mount on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --azure + if: build.env.azure == '1' + label: test_yaml_launch_and_mount on azure diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 59ad47bfc27..06a4d750931 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -1,868 +1,875 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + aws: '1' + azure: '1' + gcp: '1' + kubernetes: '1' steps: -- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts + --aws + if: build.env.aws == '1' + label: test_file_mounts on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + if: build.env.aws == '1' + label: test_using_file_mounts_with_env_vars on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + if: build.env.aws == '1' + label: test_aws_storage_mounts_with_stop on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env.gcp == '1' + label: test_gcp_storage_mounts_with_stop on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env.azure == '1' + label: test_azure_storage_mounts_with_stop on azure + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_storage_mounts on kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_context_switch on kubernetes + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + if: build.env.aws == '1' + label: test_docker_storage_mounts on aws + - 
agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_external_deletion on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_public_bucket on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_nonexistent_bucket on aws + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_private_bucket on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_list_source on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_invalid_names on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env.aws == '1' + 
label: TestStorageWithCredentials::test_aws_regions on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_gcs_regions on aws +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env.gcp == '1' + label: test_skyserve_gcp_http on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + if: build.env.aws == '1' + label: test_skyserve_aws_http on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http + --azure + if: build.env.azure == '1' + label: test_skyserve_azure_http on azure + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + if: build.env.kubernetes == '1' + label: test_skyserve_kubernetes_http on kubernetes + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env.aws == '1' + label: test_skyserve_llm on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + if: build.env.gcp == '1' + label: test_skyserve_spot_recovery on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + if: build.env.aws == '1' + label: test_skyserve_base_ondemand_fallback on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env.gcp == '1' + label: test_skyserve_dynamic_ondemand_fallback on gcp + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + if: build.env.aws == '1' + label: test_skyserve_user_bug_restart on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws + if: build.env.aws == '1' + label: test_skyserve_load_balancer on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + if: build.env.gcp == '1' + label: test_skyserve_auto_restart on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + if: build.env.aws == '1' + label: test_skyserve_cancel on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env.aws == '1' + label: test_skyserve_streaming on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --aws + if: build.env.aws == '1' + label: test_skyserve_readiness_timeout_fail on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + if: build.env.aws == '1' + label: test_skyserve_large_readiness_timeout on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + if: build.env.aws == '1' + label: test_skyserve_update on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws + if: build.env.aws == '1' + label: test_skyserve_rolling_update on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update + --aws + if: build.env.aws == '1' + label: test_skyserve_fast_update on aws + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --aws + if: build.env.aws == '1' + label: test_skyserve_update_autoscale on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --aws + if: build.env.aws == '1' + label: test_skyserve_new_autoscaler_update on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + if: build.env.aws == '1' + label: test_skyserve_failures on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + if: build.env.aws == '1' + label: test_user_dependencies on aws +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + if: build.env.aws == '1' label: test_aws_images on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + if: build.env.gcp == '1' label: test_gcp_images on gcp - - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env.azure == '1' label: test_azure_images on azure - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env.aws == '1' label: test_aws_image_id_dict on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - 
agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env.gcp == '1' label: test_gcp_image_id_dict on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_image_id_dict_region on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_image_id_dict_region on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_image_id_dict_zone on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_image_id_dict_zone on gcp - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + if: build.env.aws == '1' label: test_clone_disk_aws on aws - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: 
+ queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + if: build.env.gcp == '1' label: test_clone_disk_gcp on gcp - - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env.gcp == '1' label: test_gcp_mig on gcp - - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_force_enable_external_ips on gcp - - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + if: build.env.aws == '1' label: test_image_no_conda on aws - - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_custom_default_conda_env on aws -- - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + if: build.env.aws == '1' label: test_example_app on aws - - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_basic.py::test_minimal --aws + if: build.env.aws == '1' label: test_minimal on aws - - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + if: build.env.aws == '1' label: test_launch_fast on aws - - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_launch_fast_with_autostop on aws - - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + if: build.env.aws == '1' label: test_stale_job on aws - - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_stale_job_manual_restart on aws - - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_stale_job_manual_restart on gcp - - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws 
+ if: build.env.aws == '1' label: test_env_check on aws - - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + if: build.env.aws == '1' label: test_cli_logs on aws - - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_core_api_sky_launch_exec on gcp - - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_core_api_sky_launch_fast on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_ordered on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_ordered_with_default on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered --aws - 
env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_unordered on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_unordered_with_default on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env.aws == '1' label: test_multiple_resources on aws - - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + if: build.env.aws == '1' label: test_sky_bench on aws - - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_kubernetes_context_failover on kubernetes - - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + if: build.env.aws == '1' + label: test_aws_region on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + if: build.env.aws == '1' + label: test_aws_with_ssh_proxy_command on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + if: build.env.gcp == '1' + label: test_gcp_region_and_service_account on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env.azure == '1' + label: test_azure_region on azure + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + if: build.env.aws == '1' + label: test_aws_zone on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env.gcp == '1' + label: test_gcp_zone on gcp +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + if: build.env.aws == '1' label: test_managed_jobs on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + if: build.env.aws == '1' label: test_job_pipeline on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup --aws - env: - 
LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_pipeline_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_recovery_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_pipeline_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_pipeline_recovery_gcp on gcp - - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_recovery_default_resources on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_recovery_multi_node_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_recovery_multi_node_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_cancellation_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_cancellation_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + - agents: + queue: 
generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_storage on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env.gcp == '1' label: test_managed_jobs_tpu on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_inline_env on aws -- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_file_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_using_file_mounts_with_env_vars on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_storage_mounts_with_stop on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_storage_mounts_with_stop on gcp - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_storage_mounts_with_stop on azure - - 
command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_storage_mounts on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_context_switch on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_docker_storage_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_bucket_external_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: 
TestStorageWithCredentials::test_bucket_bulk_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_public_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_nonexistent_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_private_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_list_source on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_invalid_names on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - env: - 
LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_aws_regions on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_gcs_regions on aws -- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_region on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_with_ssh_proxy_command on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_region_and_service_account on gcp - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_region on azure - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_zone on aws - - command: pytest 
tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_zone on gcp -- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + if: build.env.aws == '1' label: test_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_job_queue_with_docker on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + if: build.env.aws == '1' label: test_lambda_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_job_queue_multinode on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + if: build.env.aws == '1' label: test_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_fast_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_docker_preinstalled_package on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + if: build.env.aws == '1' label: test_multi_echo on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + if: build.env.aws == '1' label: test_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_lambda_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + if: build.env.aws == '1' label: test_inferentia on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: 
generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + if: build.env.gcp == '1' label: test_tpu on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + if: build.env.gcp == '1' label: test_tpu_vm on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + if: build.env.gcp == '1' label: test_tpu_vm_pod on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_tpu_pod_slice_gke on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + if: build.env.aws == '1' label: test_multi_hostname on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multi_node_failure on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports --gcp - 
env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_http_server_with_custom_ports on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_http_server_with_custom_ports on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.azure == '1' label: test_azure_http_server_with_custom_ports on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_kubernetes_http_server_with_custom_ports on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env.aws == '1' label: test_task_labels_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env.gcp == '1' label: test_task_labels_gcp on gcp - - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_task_labels_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_container_logs_multinode_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_container_logs_two_jobs_kubernetes on 
kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + if: build.env.aws == '1' label: test_distributed_tf on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + if: build.env.gcp == '1' label: test_gcp_start_stop on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + if: build.env.azure == '1' label: test_azure_start_stop on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + if: build.env.aws == '1' label: test_autostop on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_autodown --aws + if: build.env.aws == '1' label: test_autodown on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + if: build.env.aws == '1' label: test_cancel_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env.gcp == '1' label: test_cancel_gcp on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env.azure == '1' label: test_cancel_azure on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + if: build.env.aws == '1' label: test_cancel_pytorch on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + if: build.env.aws == '1' label: test_use_spot on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + if: build.env.gcp == '1' 
label: test_stop_gcp_spot on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + if: build.env.aws == '1' label: test_inline_env on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env.aws == '1' label: test_inline_env_file on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env.aws == '1' label: test_aws_custom_image on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_kubernetes_custom_image on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.azure == '1' label: test_azure_start_stop_two_nodes on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier 
--aws + if: build.env.aws == '1' label: test_aws_disk_tier on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env.gcp == '1' label: test_gcp_disk_tier on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + if: build.env.azure == '1' label: test_azure_disk_tier on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.azure == '1' label: test_azure_best_tier_failover on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_zero_quota_failover on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_zero_quota_failover on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script --aws - env: 
- LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_long_setup_run_script on aws -- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_gcp_http on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_aws_http on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http - --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_azure_http on azure - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_kubernetes_http on kubernetes - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_llm on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_spot_recovery on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_base_ondemand_fallback on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_dynamic_ondemand_fallback on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_user_bug_restart on aws - - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_load_balancer on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_auto_restart on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_cancel on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_streaming on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_readiness_timeout_fail on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_large_readiness_timeout on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_rolling_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_fast_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update_autoscale on aws - - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_new_autoscaler_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_failures on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_user_dependencies on aws From 2cff4bd74a1aef37fd08e6e812b245a9110b4bb5 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 22:51:18 +0800 Subject: [PATCH 43/64] bug fix --- .buildkite/generate_pipeline.py | 4 +- .../pipeline_smoke_tests_pre_merge.yaml | 24 +- .buildkite/pipeline_smoke_tests_release.yaml | 1722 ++++++++--------- 3 files changed, 871 insertions(+), 879 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 5b1aded60fd..45efa758844 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -136,7 +136,9 @@ def _dump_pipeline_to_file( for yaml_file_path, pipelines in output_file_pipelines_map.items(): with open(yaml_file_path, 'w', encoding='utf-8') as file: file.write(GENERATED_FILE_HEAD) - all_steps = [pipeline['steps'] for pipeline in pipelines] + all_steps = [] + for pipeline in pipelines: + all_steps.extend(pipeline['steps']) # Shuffle the steps to avoid flakyness, consecutive runs of the same # kind of test may fail for requiring locks on the same resources. 
random.shuffle(all_steps) diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index be0e34876dc..41d2909b1f8 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -3,15 +3,15 @@ env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) steps: -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env.aws == '1' - label: test_yaml_launch_and_mount on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --azure - if: build.env.azure == '1' - label: test_yaml_launch_and_mount on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --azure + if: build.env.azure == '1' + label: test_yaml_launch_and_mount on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + if: build.env.aws == '1' + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 06a4d750931..928a79c0ded 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -7,869 +7,859 @@ env: gcp: '1' kubernetes: '1' steps: -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts - --aws - if: build.env.aws == '1' - label: test_file_mounts on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - if: build.env.aws == '1' - label: test_using_file_mounts_with_env_vars on aws - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - if: build.env.aws == '1' - label: test_aws_storage_mounts_with_stop on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - if: build.env.gcp == '1' - label: test_gcp_storage_mounts_with_stop on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env.azure == '1' - label: test_azure_storage_mounts_with_stop on azure - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_storage_mounts on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_switch on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - if: build.env.aws == '1' - label: test_docker_storage_mounts on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - on aws - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_external_deletion on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_public_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_nonexistent_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_private_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - if: build.env.aws == '1' - label: 
TestStorageWithCredentials::test_copy_mount_existing_storage on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_list_source on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_invalid_names on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_aws_regions on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_gcs_regions on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env.gcp == '1' - label: test_skyserve_gcp_http on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - if: build.env.aws == '1' - label: test_skyserve_aws_http 
on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http - --azure - if: build.env.azure == '1' - label: test_skyserve_azure_http on azure - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - if: build.env.kubernetes == '1' - label: test_skyserve_kubernetes_http on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - if: build.env.aws == '1' - label: test_skyserve_llm on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env.gcp == '1' - label: test_skyserve_spot_recovery on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws - if: build.env.aws == '1' - label: test_skyserve_base_ondemand_fallback on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env.gcp == '1' - label: test_skyserve_dynamic_ondemand_fallback on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - if: build.env.aws == '1' - label: test_skyserve_user_bug_restart on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws - if: build.env.aws == '1' - label: test_skyserve_load_balancer on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env.gcp == '1' - label: test_skyserve_auto_restart on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - if: build.env.aws == '1' - label: test_skyserve_cancel 
on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env.aws == '1' - label: test_skyserve_streaming on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws - if: build.env.aws == '1' - label: test_skyserve_readiness_timeout_fail on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - if: build.env.aws == '1' - label: test_skyserve_large_readiness_timeout on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env.aws == '1' - label: test_skyserve_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - if: build.env.aws == '1' - label: test_skyserve_rolling_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update - --aws - if: build.env.aws == '1' - label: test_skyserve_fast_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws - if: build.env.aws == '1' - label: test_skyserve_update_autoscale on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - if: build.env.aws == '1' - label: test_skyserve_new_autoscaler_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env.aws == '1' - label: test_skyserve_failures on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env.aws == '1' - label: test_user_dependencies on aws -- - agents: - 
queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - if: build.env.aws == '1' - label: test_aws_images on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env.gcp == '1' - label: test_gcp_images on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env.azure == '1' - label: test_azure_images on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region - --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict_region on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_region on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone - --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict_zone on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone - --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_zone on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - if: build.env.aws == '1' - label: test_clone_disk_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - if: build.env.gcp == '1' - label: test_clone_disk_gcp on gcp 
- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env.gcp == '1' - label: test_gcp_mig on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env.gcp == '1' - label: test_gcp_force_enable_external_ips on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env.aws == '1' - label: test_image_no_conda on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws - if: build.env.aws == '1' - label: test_custom_default_conda_env on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - if: build.env.aws == '1' - label: test_example_app on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - if: build.env.aws == '1' - label: test_minimal on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - if: build.env.aws == '1' - label: test_launch_fast on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - if: build.env.aws == '1' - label: test_launch_fast_with_autostop on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - if: build.env.aws == '1' - label: test_stale_job on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - if: build.env.aws == '1' - label: test_aws_stale_job_manual_restart on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - if: build.env.gcp == '1' - label: 
test_gcp_stale_job_manual_restart on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - if: build.env.aws == '1' - label: test_env_check on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - if: build.env.aws == '1' - label: test_cli_logs on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec - --gcp - if: build.env.gcp == '1' - label: test_core_api_sky_launch_exec on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast - --aws - if: build.env.aws == '1' - label: test_core_api_sky_launch_fast on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered_with_default on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered_with_default on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - if: build.env.aws == '1' - label: test_multiple_resources on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env.aws == '1' - label: test_sky_bench on aws 
- - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_failover on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - --aws - if: build.env.aws == '1' - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - if: build.env.aws == '1' - label: test_aws_region on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - if: build.env.aws == '1' - label: test_aws_with_ssh_proxy_command on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - if: build.env.gcp == '1' - label: test_gcp_region_and_service_account on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env.azure == '1' - label: test_azure_region on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - if: build.env.aws == '1' - label: test_aws_zone on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env.gcp == '1' - label: test_gcp_zone on gcp -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - if: build.env.aws == '1' - label: test_managed_jobs on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env.aws == '1' - label: test_job_pipeline on aws - - agents: - queue: generic_cloud - 
command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws - if: build.env.aws == '1' - label: test_managed_jobs_failed_setup on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_failed_setup on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_recovery_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_pipeline_recovery_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_default_resources on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_multi_node_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_multi_node_gcp on gcp - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_cancellation_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_cancellation_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --aws - if: build.env.aws == '1' - label: test_managed_jobs_storage on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_tpu on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws - if: build.env.aws == '1' - label: test_managed_jobs_inline_env on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env.aws == '1' - label: test_job_queue on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws - if: build.env.aws == '1' - label: test_job_queue_with_docker on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - if: build.env.aws == '1' - label: test_lambda_job_queue on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode - --aws - if: build.env.aws == '1' - label: test_job_queue_multinode on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - if: build.env.aws == '1' - label: test_large_job_queue on aws - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - if: build.env.aws == '1' - label: test_fast_large_job_queue on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env.aws == '1' - label: test_docker_preinstalled_package on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - if: build.env.aws == '1' - label: test_multi_echo on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - if: build.env.aws == '1' - label: test_huggingface on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface - --aws - if: build.env.aws == '1' - label: test_lambda_huggingface on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env.aws == '1' - label: test_inferentia on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - if: build.env.gcp == '1' - label: test_tpu on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env.gcp == '1' - label: test_tpu_vm on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - if: build.env.gcp == '1' - label: test_tpu_vm_pod on gcp - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke - --kubernetes - if: build.env.kubernetes == '1' - label: test_tpu_pod_slice_gke on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - if: build.env.aws == '1' - label: test_multi_hostname on aws - - agents: - queue: generic_cloud - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure - --aws - if: build.env.aws == '1' - label: test_multi_node_failure on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env.gcp == '1' - label: test_gcp_http_server_with_custom_ports on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - if: build.env.aws == '1' - label: test_aws_http_server_with_custom_ports on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env.azure == '1' - label: test_azure_http_server_with_custom_ports on azure - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_http_server_with_custom_ports on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - if: build.env.aws == '1' - label: test_task_labels_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env.gcp == '1' - label: test_task_labels_gcp on gcp - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_task_labels_kubernetes on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - if: build.env.kubernetes == '1' - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - - agents: - queue: kubernetes - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env.kubernetes == '1' - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_multinode_kubernetes on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_jobs_kubernetes on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - if: build.env.aws == '1' - label: test_distributed_tf on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env.gcp == '1' - label: test_gcp_start_stop on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - if: build.env.azure == '1' - label: test_azure_start_stop on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - if: build.env.aws == '1' - label: test_autostop on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - if: build.env.aws == '1' - label: test_autodown on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - if: build.env.aws == '1' - 
label: test_cancel_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - if: build.env.gcp == '1' - label: test_cancel_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - if: build.env.azure == '1' - label: test_cancel_azure on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env.aws == '1' - label: test_cancel_pytorch on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - if: build.env.aws == '1' - label: test_use_spot on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - if: build.env.gcp == '1' - label: test_stop_gcp_spot on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - if: build.env.aws == '1' - label: test_inline_env on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - if: build.env.aws == '1' - label: test_inline_env_file on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env.aws == '1' - label: test_aws_custom_image on aws - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_custom_image on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env.azure == '1' - label: test_azure_start_stop_two_nodes on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier 
--aws - if: build.env.aws == '1' - label: test_aws_disk_tier on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - if: build.env.gcp == '1' - label: test_gcp_disk_tier on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env.azure == '1' - label: test_azure_disk_tier on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - if: build.env.azure == '1' - label: test_azure_best_tier_failover on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws - if: build.env.aws == '1' - label: test_aws_zero_quota_failover on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env.gcp == '1' - label: test_gcp_zero_quota_failover on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - if: build.env.aws == '1' - label: test_long_setup_run_script on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_gcs_regions on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + if: build.env.gcp == '1' + label: test_clone_disk_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env.azure == '1' + label: test_azure_region on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + if: build.env.aws == '1' + label: 
test_aws_disk_tier on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + if: build.env.gcp == '1' + label: test_gcp_http_server_with_custom_ports on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws + if: build.env.aws == '1' + label: test_fast_large_job_queue on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_http_server_with_custom_ports on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + if: build.env.aws == '1' + label: test_multi_echo on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + if: build.env.aws == '1' + label: test_minimal on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_unordered on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env.gcp == '1' + label: test_gcp_mig on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_context_switch on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + if: build.env.aws == '1' + label: test_autodown on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + if: build.env.aws == '1' + label: 
TestStorageWithCredentials::test_upload_to_existing_bucket on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_unordered_with_default on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env.aws == '1' + label: test_skyserve_streaming on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_recovery_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_pipeline_recovery_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + if: build.env.aws == '1' + label: test_aws_with_ssh_proxy_command on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + if: build.env.gcp == '1' + label: test_skyserve_spot_recovery on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws + 
if: build.env.aws == '1' + label: test_skyserve_rolling_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + if: build.env.gcp == '1' + label: test_gcp_image_id_dict_zone on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_ordered on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + if: build.env.aws == '1' + label: test_aws_images on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env.azure == '1' + label: test_azure_storage_mounts_with_stop on azure +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes + if: build.env.kubernetes == '1' + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + if: build.env.gcp == '1' + label: test_gcp_region_and_service_account on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + if: build.env.aws == '1' + label: test_inline_env on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + if: build.env.azure == '1' + label: test_azure_disk_tier on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + if: build.env.aws == '1' + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + if: build.env.gcp == '1' + label: test_stop_gcp_spot on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + if: build.env.aws == '1' + label: test_skyserve_large_readiness_timeout on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + --aws + if: build.env.aws == '1' + label: test_aws_stale_job_manual_restart on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + if: build.env.aws == '1' + label: test_image_no_conda on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --aws + if: build.env.aws == '1' + label: test_managed_jobs_storage on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_recovery_multi_node_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + if: build.env.gcp == '1' + label: test_gcp_stale_job_manual_restart on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + --aws + if: build.env.aws == '1' + label: test_aws_http_server_with_custom_ports on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env.gcp == '1' + label: test_cancel_gcp on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_task_labels_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: 
pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + if: build.env.aws == '1' + label: test_core_api_sky_launch_fast on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + if: build.env.aws == '1' + label: test_skyserve_base_ondemand_fallback on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + if: build.env.aws == '1' + label: test_clone_disk_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + if: build.env.aws == '1' + label: test_autostop on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --aws + if: build.env.aws == '1' + label: test_managed_jobs_pipeline_failed_setup on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env.gcp == '1' + label: test_skyserve_dynamic_ondemand_fallback on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + if: build.env.aws == '1' + label: test_job_pipeline on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_public_bucket on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_private_bucket on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env.gcp == '1' + label: test_task_labels_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + if: build.env.aws == '1' + label: test_multi_node_failure on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes + if: build.env.kubernetes == '1' + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + if: build.env.aws == '1' + label: test_docker_storage_mounts on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + if: build.env.aws == '1' + label: test_huggingface on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + if: build.env.aws == '1' + label: test_file_mounts on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + if: build.env.gcp == '1' + label: test_gcp_zero_quota_failover on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws + if: build.env.aws == '1' + label: 
test_lambda_huggingface on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env.gcp == '1' + label: test_gcp_image_id_dict on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + if: build.env.aws == '1' + label: test_aws_storage_mounts_with_stop on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + if: build.env.aws == '1' + label: test_launch_fast_with_autostop on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env.aws == '1' + label: test_aws_image_id_dict on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + if: build.env.gcp == '1' + label: test_gcp_image_id_dict_region on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + if: build.env.azure == '1' + label: test_azure_best_tier_failover on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --aws + if: build.env.aws == '1' + label: test_skyserve_readiness_timeout_fail on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + if: build.env.aws == '1' + label: test_multi_hostname on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_ordered_with_default on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + if: build.env.azure == '1' + label: 
test_skyserve_azure_http on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + if: build.env.aws == '1' + label: test_job_queue_multinode on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + if: build.env.aws == '1' + label: test_skyserve_fast_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + if: build.env.aws == '1' + label: test_skyserve_aws_http on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --aws + if: build.env.aws == '1' + label: test_skyserve_new_autoscaler_update on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_storage_mounts on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env.azure == '1' + label: test_azure_images on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + if: build.env.aws == '1' + label: test_use_spot on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_tpu on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + if: build.env.aws == '1' + label: test_skyserve_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + if: build.env.gcp == '1' + label: test_gcp_start_stop on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + if: build.env.aws == '1' + label: test_skyserve_user_bug_restart on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + if: build.env.aws == '1' + label: test_env_check on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + if: build.env.gcp == '1' + label: test_skyserve_auto_restart on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env.gcp == '1' + label: test_gcp_storage_mounts_with_stop on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws + if: build.env.aws == '1' + label: test_managed_jobs_inline_env on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + if: build.env.aws == '1' + label: test_using_file_mounts_with_env_vars on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env.aws == '1' + label: test_aws_custom_image on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + if: 
build.env.aws == '1' + label: test_job_queue on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + if: build.env.aws == '1' + label: test_long_setup_run_script on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + if: build.env.aws == '1' + label: test_skyserve_failures on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_custom_image on kubernetes +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + if: build.env.kubernetes == '1' + label: test_skyserve_kubernetes_http on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + if: build.env.gcp == '1' + label: test_tpu_vm on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env.aws == '1' + label: test_inline_env_file on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws + if: build.env.aws == '1' + label: test_skyserve_load_balancer on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_pipeline_recovery_gcp on gcp +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + if: build.env.aws == '1' + label: test_distributed_tf on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --aws + if: build.env.aws == '1' + label: test_managed_jobs_failed_setup on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_recovery_multi_node_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env.aws == '1' + label: test_multiple_resources on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_container_logs_two_jobs_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + if: build.env.aws == '1' + label: test_cancel_pytorch on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + if: build.env.aws == '1' + label: test_stale_job on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + if: build.env.gcp == '1' + label: test_gcp_force_enable_external_ips on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env.azure == '1' + label: test_cancel_azure on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_aws_regions on aws +- agents: + queue: generic_cloud + command: 
pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_nonexistent_bucket on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --aws + if: build.env.aws == '1' + label: test_skyserve_update_autoscale on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws + if: build.env.aws == '1' + label: test_job_queue_with_docker on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_context_failover on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + if: build.env.aws == '1' + label: test_cli_logs on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + --aws + if: build.env.aws == '1' + label: test_aws_zero_quota_failover on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + if: build.env.aws == '1' + label: test_aws_zone on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + if: build.env.aws == '1' + label: test_cancel_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + if: build.env.aws == '1' + label: test_aws_image_id_dict_region on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + if: build.env.aws == '1' + label: test_lambda_job_queue on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + if: build.env.azure == '1' + label: test_azure_start_stop_two_nodes on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env.aws == '1' + label: test_task_labels_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + --aws + if: build.env.aws == '1' + label: test_docker_preinstalled_package on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + if: build.env.azure == '1' + label: test_azure_http_server_with_custom_ports on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + if: build.env.gcp == '1' + label: test_tpu_vm_pod on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + if: build.env.aws == '1' + label: test_launch_fast on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_external_deletion on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + if: build.env.aws == '1' + label: test_custom_default_conda_env on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_invalid_names on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + if: build.env.aws == '1' + label: test_aws_region on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + if: build.env.aws == '1' + label: test_example_app on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + if: build.env.aws == '1' + label: test_skyserve_cancel on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_cancellation_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + if: build.env.aws == '1' + label: test_inferentia on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env.gcp == '1' + label: test_skyserve_gcp_http on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + if: build.env.kubernetes == '1' + label: test_tpu_pod_slice_gke on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + if: build.env.gcp == '1' + label: test_gcp_images on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_container_logs_multinode_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu 
--gcp + if: build.env.gcp == '1' + label: test_tpu on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_cancellation_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env.aws == '1' + label: test_skyserve_llm on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + if: build.env.aws == '1' + label: test_sky_bench on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + if: build.env.aws == '1' + label: test_large_job_queue on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env.gcp == '1' + label: test_gcp_disk_tier on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + if: build.env.aws == '1' + label: test_user_dependencies on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + if: build.env.aws == '1' + label: test_managed_jobs on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_list_source on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp + if: build.env.gcp == '1' + label: test_core_api_sky_launch_exec on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + if: build.env.aws == '1' + label: test_aws_image_id_dict_zone on aws +- agents: + queue: generic_cloud + command: 
pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env.gcp == '1' + label: test_gcp_zone on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_recovery_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + if: build.env.azure == '1' + label: test_azure_start_stop on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws + if: build.env.aws == '1' + label: test_managed_jobs_recovery_default_resources on aws From 19fc691fcf72f1d9cc237c3e4136149aaa49c80d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 23:03:07 +0800 Subject: [PATCH 44/64] bug fix --- .buildkite/generate_pipeline.py | 2 +- .../pipeline_smoke_tests_pre_merge.yaml | 12 +- .buildkite/pipeline_smoke_tests_release.yaml | 1052 ++++++++--------- 3 files changed, 533 insertions(+), 533 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 45efa758844..3c3b9c41edf 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -115,7 +115,7 @@ def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dic # Since some are more costly 'queue': CLOUD_QUEUE_MAP[cloud] }, - 'if': f'build.env.{cloud} == \'1\'' + 'if': f'build.env("{cloud}") == "1"' } steps.append(step) if one_cloud_per_test_function: diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index 41d2909b1f8..35ba7ea17ec 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -6,12 +6,12 @@ steps: - agents: queue: generic_cloud command: pytest 
tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --azure - if: build.env.azure == '1' - label: test_yaml_launch_and_mount on azure + --aws + if: build.env("aws") == "1" + label: test_yaml_launch_and_mount on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env.aws == '1' - label: test_yaml_launch_and_mount on aws + --azure + if: build.env("azure") == "1" + label: test_yaml_launch_and_mount on azure diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 928a79c0ded..fb22f52afec 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -9,857 +9,857 @@ env: steps: - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_gcs_regions on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_spot_recovery on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - if: build.env.gcp == '1' - label: test_clone_disk_gcp on gcp + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env("azure") == "1" + label: test_azure_storage_mounts_with_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env.azure == '1' - label: test_azure_region on azure + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_list_source on aws - agents: queue: generic_cloud - 
command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - if: build.env.aws == '1' - label: test_aws_disk_tier on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_cancellation_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env.gcp == '1' - label: test_gcp_http_server_with_custom_ports on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + if: build.env("aws") == "1" + label: test_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - if: build.env.aws == '1' - label: test_fast_large_job_queue on aws + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + if: build.env("aws") == "1" + label: test_cli_logs on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env("gcp") == "1" + label: test_gcp_disk_tier on gcp - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_http_server_with_custom_ports on kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_custom_image on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - if: build.env.aws == '1' - label: test_multi_echo on aws + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + if: build.env("aws") == "1" + label: test_launch_fast on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_basic.py::test_minimal --aws - if: build.env.aws == '1' - label: test_minimal on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + if: build.env("azure") == "1" + label: test_skyserve_azure_http on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env.gcp == '1' - label: test_gcp_mig on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_switch on kubernetes + if: build.env("aws") == "1" + label: test_managed_jobs_storage on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - if: build.env.aws == '1' - label: test_autodown on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_aws_regions on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_nonexistent_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + 
command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered_with_default on aws + if: build.env("aws") == "1" + label: test_long_setup_run_script on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env.aws == '1' - label: test_skyserve_streaming on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_private_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + if: build.env("gcp") == "1" + label: test_gcp_http_server_with_custom_ports on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_aws on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + if: build.env("aws") == "1" + label: test_lambda_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_recovery_aws on aws + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + if: build.env("aws") == "1" + label: test_clone_disk_aws on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command --aws - if: build.env.aws == '1' + if: 
build.env("aws") == "1" label: test_aws_with_ssh_proxy_command on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + if: build.env("aws") == "1" + label: test_launch_fast_with_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env.gcp == '1' - label: test_skyserve_spot_recovery on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + if: build.env("aws") == "1" + label: test_cancel_pytorch on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - if: build.env.aws == '1' - label: test_skyserve_rolling_update on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + if: build.env("aws") == "1" + label: test_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_zone on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env("gcp") == "1" + label: test_skyserve_gcp_http on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - if: build.env.aws == '1' - label: test_aws_images on aws + if: build.env("aws") == "1" + 
label: test_fast_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env.azure == '1' - label: test_azure_storage_mounts_with_stop on azure -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env.kubernetes == '1' - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + if: build.env("aws") == "1" + label: test_file_mounts on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - if: build.env.gcp == '1' - label: test_gcp_region_and_service_account on gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env("azure") == "1" + label: test_azure_region on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - if: build.env.aws == '1' - label: test_inline_env on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env.azure == '1' - label: test_azure_disk_tier on azure + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_pipeline_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + command: 
pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env --aws - if: build.env.aws == '1' - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws + if: build.env("aws") == "1" + label: test_managed_jobs_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - if: build.env.gcp == '1' - label: test_stop_gcp_spot on gcp + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env("azure") == "1" + label: test_azure_images on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - if: build.env.aws == '1' - label: test_skyserve_large_readiness_timeout on aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_zone on gcp - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - if: build.env.aws == '1' - label: test_aws_stale_job_manual_restart on aws + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_failover on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env.aws == '1' - label: test_image_no_conda on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env("aws") == "1" + label: test_task_labels_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --aws - if: build.env.aws == '1' - label: test_managed_jobs_storage on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + if: build.env("aws") == "1" + label: test_job_pipeline on 
aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws - if: build.env.aws == '1' + if: build.env("aws") == "1" label: test_managed_jobs_recovery_multi_node_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - if: build.env.gcp == '1' - label: test_gcp_stale_job_manual_restart on gcp + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - if: build.env.aws == '1' - label: test_aws_http_server_with_custom_ports on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env("aws") == "1" + label: test_skyserve_llm on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - if: build.env.gcp == '1' - label: test_cancel_gcp on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_task_labels_kubernetes on kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + if: build.env("azure") == "1" + label: test_azure_best_tier_failover on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws - if: build.env.aws == '1' - label: test_core_api_sky_launch_fast on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + if: build.env("aws") == "1" + label: test_inferentia on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + if: 
build.env("kubernetes") == "1" + label: test_tpu_pod_slice_gke on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup --aws - if: build.env.aws == '1' - label: test_skyserve_base_ondemand_fallback on aws + if: build.env("aws") == "1" + label: test_managed_jobs_failed_setup on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - if: build.env.aws == '1' - label: test_clone_disk_aws on aws + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + if: build.env("gcp") == "1" + label: test_clone_disk_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - if: build.env.aws == '1' - label: test_autostop on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + if: build.env("aws") == "1" + label: test_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env("gcp") == "1" + label: test_cancel_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_failed_setup on aws + if: build.env("aws") == "1" + label: test_aws_http_server_with_custom_ports on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env.gcp == '1' - label: test_skyserve_dynamic_ondemand_fallback on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env("gcp") == "1" + label: test_task_labels_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env.aws == '1' - label: test_job_pipeline on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + if: build.env("gcp") == "1" + label: test_gcp_images on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_public_bucket on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_tpu on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_private_bucket on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + if: build.env("aws") == "1" + label: test_multi_node_failure on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + if: build.env("aws") == "1" + label: test_docker_storage_mounts on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env.gcp == '1' - label: 
test_task_labels_gcp on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env("aws") == "1" + label: test_aws_custom_image on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - if: build.env.aws == '1' - label: test_multi_node_failure on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + if: build.env("gcp") == "1" + label: test_tpu_vm_pod on gcp - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts --kubernetes - if: build.env.kubernetes == '1' - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_storage_mounts on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - if: build.env.aws == '1' - label: test_docker_storage_mounts on aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + if: build.env("aws") == "1" + label: test_aws_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - if: build.env.aws == '1' - label: test_huggingface on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + if: build.env("aws") == "1" + label: test_use_spot on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws - if: build.env.aws == '1' - label: test_file_mounts on aws + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + if: build.env("aws") == "1" + label: test_example_app on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env.gcp == '1' - label: test_gcp_zero_quota_failover on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + if: build.env("aws") == "1" + label: test_cancel_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws - if: build.env.aws == '1' - label: test_lambda_huggingface on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + if: build.env("aws") == "1" + label: test_skyserve_aws_http on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict on gcp + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_jobs_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket --aws - if: build.env.aws == '1' - label: test_aws_storage_mounts_with_stop on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_public_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - if: build.env.aws == '1' - label: test_launch_fast_with_autostop on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + if: build.env("aws") == "1" + label: test_distributed_tf on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict on aws + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_region on gcp + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp + if: build.env("gcp") == "1" + label: test_core_api_sky_launch_exec on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - if: build.env.azure == '1' - label: test_azure_best_tier_failover on azure + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env("aws") == "1" + label: test_multiple_resources on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_task_labels_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update --aws - if: build.env.aws == '1' - label: test_skyserve_readiness_timeout_fail on aws + if: build.env("aws") == "1" + label: test_skyserve_new_autoscaler_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - if: build.env.aws == '1' - label: test_multi_hostname on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + if: build.env("azure") == "1" + label: test_azure_disk_tier on azure - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered_with_default on aws + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_switch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - if: build.env.azure == '1' - label: test_skyserve_azure_http on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + if: build.env("aws") == "1" + label: test_skyserve_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws - if: build.env.aws == '1' - label: test_job_queue_multinode on aws + if: build.env("aws") == "1" + label: test_docker_preinstalled_package on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_gcs_regions on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws - if: build.env.aws == '1' - label: test_skyserve_fast_update on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + if: build.env("aws") == "1" + label: test_skyserve_large_readiness_timeout on aws - agents: queue: generic_cloud - command: 
pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - if: build.env.aws == '1' - label: test_skyserve_aws_http on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + if: build.env("aws") == "1" + label: test_job_queue_multinode on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - if: build.env.aws == '1' - label: test_skyserve_new_autoscaler_update on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env("aws") == "1" + label: test_skyserve_streaming on aws - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_storage_mounts on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + if: build.env("gcp") == "1" + label: test_tpu on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env.azure == '1' - label: test_azure_images on azure + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + if: build.env("aws") == "1" + label: test_multiple_accelerators_unordered on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - if: build.env.aws == '1' - label: test_use_spot on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_cancellation_gcp 
on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_tpu on gcp + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + if: build.env("aws") == "1" + label: test_multiple_accelerators_ordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env.aws == '1' - label: test_skyserve_update on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict_region on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env.gcp == '1' - label: test_gcp_start_stop on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env("azure") == "1" + label: test_cancel_azure on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - if: build.env.aws == '1' - label: test_skyserve_user_bug_restart on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + if: build.env("gcp") == "1" + label: test_gcp_force_enable_external_ips on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - if: build.env.aws == '1' - label: test_env_check on aws + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + if: build.env("aws") == "1" + label: test_core_api_sky_launch_fast on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env.gcp == '1' - label: test_skyserve_auto_restart on gcp + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + if: build.env("azure") == "1" + label: test_azure_start_stop_two_nodes on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - if: build.env.gcp == '1' - label: test_gcp_storage_mounts_with_stop on gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + if: build.env("aws") == "1" + label: test_aws_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws - if: build.env.aws == '1' - label: test_managed_jobs_inline_env on aws + if: build.env("aws") == "1" + label: test_aws_zero_quota_failover on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars --aws - if: build.env.aws == '1' + if: build.env("aws") == "1" label: test_using_file_mounts_with_env_vars on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env.aws == '1' - label: test_aws_custom_image on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env.aws == '1' - label: test_job_queue on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default --aws - if: build.env.aws == '1' - label: test_long_setup_run_script on aws + if: 
build.env("aws") == "1" + label: test_multiple_accelerators_unordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env.aws == '1' - label: test_skyserve_failures on aws + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + if: build.env("aws") == "1" + label: test_multiple_accelerators_ordered on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_custom_image on kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - if: build.env.kubernetes == '1' - label: test_skyserve_kubernetes_http on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + if: build.env("aws") == "1" + label: test_minimal on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env.gcp == '1' - label: test_tpu_vm on gcp + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env("gcp") == "1" + label: test_gcp_storage_mounts_with_stop on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - if: build.env.aws == '1' - label: test_inline_env_file on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + if: build.env("aws") == "1" + label: test_job_queue on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + if: build.env("aws") == "1" + label: test_aws_images on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion --aws - if: build.env.aws == '1' - label: test_skyserve_load_balancer on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_bucket_external_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_pipeline_recovery_gcp on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_pipeline_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - if: build.env.aws == '1' - label: test_distributed_tf on aws + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + if: build.env("aws") == "1" + label: test_image_no_conda on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + if: build.env("aws") == "1" + label: test_user_dependencies on aws - agents: queue: generic_cloud - command: 
pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart --aws - if: build.env.aws == '1' - label: test_managed_jobs_failed_setup on aws + if: build.env("aws") == "1" + label: test_aws_stale_job_manual_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_multi_node_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - if: build.env.aws == '1' - label: test_multiple_resources on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_jobs_kubernetes on kubernetes + if: build.env("gcp") == "1" + label: test_gcp_zero_quota_failover on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env.aws == '1' - label: test_cancel_pytorch on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict_zone on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - if: build.env.aws == '1' - label: test_stale_job on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + if: build.env("azure") == "1" + label: test_azure_start_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env.gcp == '1' - label: test_gcp_force_enable_external_ips on gcp + command: 
pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - if: build.env.azure == '1' - label: test_cancel_azure on azure + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_aws_regions on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env("aws") == "1" + label: test_inline_env_file on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_nonexistent_bucket on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + if: build.env("aws") == "1" + label: test_managed_jobs on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names --aws - if: build.env.aws == '1' - label: test_skyserve_update_autoscale on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_invalid_names on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup --aws - if: build.env.aws == '1' - label: test_job_queue_with_docker on aws + if: build.env("aws") == "1" + 
label: test_managed_jobs_pipeline_failed_setup on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_failover on kubernetes + if: build.env("kubernetes") == "1" + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - if: build.env.aws == '1' - label: test_cli_logs on aws + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + if: build.env("aws") == "1" + label: test_stale_job on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update --aws - if: build.env.aws == '1' - label: test_aws_zero_quota_failover on aws + if: build.env("aws") == "1" + label: test_skyserve_rolling_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - if: build.env.aws == '1' - label: test_aws_zone on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + if: build.env("aws") == "1" + label: test_autodown on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - if: build.env.aws == '1' - label: test_cancel_aws on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + if: build.env("gcp") == "1" + label: test_tpu_vm on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources --aws - if: build.env.aws == '1' - label: 
test_aws_image_id_dict_region on aws + if: build.env("aws") == "1" + label: test_managed_jobs_recovery_default_resources on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - if: build.env.aws == '1' - label: test_lambda_job_queue on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws + if: build.env("aws") == "1" + label: test_lambda_huggingface on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws + if: build.env("aws") == "1" + label: test_job_queue_with_docker on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env.azure == '1' - label: test_azure_start_stop_two_nodes on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + if: build.env("aws") == "1" + label: test_skyserve_failures on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - if: build.env.aws == '1' - label: test_task_labels_aws on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + if: build.env("aws") == "1" + label: test_skyserve_base_ondemand_fallback on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env.aws == '1' - label: test_docker_preinstalled_package on aws + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + if: build.env("aws") == "1" + label: test_sky_bench on aws - agents: 
queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env.azure == '1' - label: test_azure_http_server_with_custom_ports on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + if: build.env("aws") == "1" + label: test_skyserve_cancel on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - if: build.env.gcp == '1' - label: test_tpu_vm_pod on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_dynamic_ondemand_fallback on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - if: build.env.aws == '1' - label: test_launch_fast on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_recovery_multi_node_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_external_deletion on aws + if: build.env("aws") == "1" + label: test_skyserve_update_autoscale on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + if: build.env("aws") == "1" + label: test_env_check on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + if: build.env("aws") == "1" + label: test_skyserve_fast_update on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion --aws - if: build.env.aws == '1' - label: test_custom_default_conda_env on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_invalid_names on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - if: build.env.aws == '1' - label: test_aws_region on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + if: build.env("aws") == "1" + label: test_multi_hostname on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - if: build.env.aws == '1' - label: test_example_app on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + if: build.env("aws") == "1" + label: test_multi_echo on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_http_server_with_custom_ports on kubernetes +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + if: build.env("kubernetes") == "1" + label: test_skyserve_kubernetes_http on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - if: build.env.aws == '1' - label: 
test_skyserve_cancel on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + if: build.env("aws") == "1" + label: test_huggingface on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer --aws - if: build.env.aws == '1' - label: test_managed_jobs_cancellation_aws on aws + if: build.env("aws") == "1" + label: test_skyserve_load_balancer on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env.aws == '1' - label: test_inferentia on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + if: build.env("gcp") == "1" + label: test_stop_gcp_spot on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env.gcp == '1' - label: test_skyserve_gcp_http on gcp + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - if: build.env.kubernetes == '1' - label: test_tpu_pod_slice_gke on kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_multinode_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env.gcp == '1' - label: test_gcp_images on gcp + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + if: build.env("aws") == "1" + 
label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_multinode_kubernetes on kubernetes + if: build.env("kubernetes") == "1" + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - if: build.env.gcp == '1' - label: test_tpu on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + if: build.env("azure") == "1" + label: test_azure_http_server_with_custom_ports on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_cancellation_gcp on gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - if: build.env.aws == '1' - label: test_skyserve_llm on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + if: build.env("aws") == "1" + label: test_aws_storage_mounts_with_stop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env.aws == '1' - label: test_sky_bench on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + if: build.env("gcp") == "1" + label: test_gcp_start_stop on gcp - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - if: build.env.aws == '1' - label: test_large_job_queue on aws + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + if: build.env("aws") == "1" + label: test_custom_default_conda_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - if: build.env.gcp == '1' - label: test_gcp_disk_tier on gcp + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + if: build.env("gcp") == "1" + label: test_gcp_stale_job_manual_restart on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env.aws == '1' - label: test_user_dependencies on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env("gcp") == "1" + label: test_gcp_mig on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - if: build.env.aws == '1' - label: test_managed_jobs on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_auto_restart on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_list_source on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + if: build.env("aws") == "1" + label: test_aws_disk_tier on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - if: build.env.gcp == '1' - label: test_core_api_sky_launch_exec on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + if: build.env("aws") == "1" + label: 
test_managed_jobs_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict_zone on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env.gcp == '1' - label: test_gcp_zone on gcp + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_gcp on gcp + if: build.env("gcp") == "1" + label: test_gcp_region_and_service_account on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - if: build.env.azure == '1' - label: test_azure_start_stop on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + if: build.env("aws") == "1" + label: test_skyserve_user_bug_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_default_resources on aws + if: build.env("aws") == "1" + label: test_skyserve_readiness_timeout_fail on aws From 115af3065dc8eccf87b032dab1e7714f24963e93 Mon 
Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 23:38:51 +0800 Subject: [PATCH 45/64] exclude lambda cloud --- .buildkite/generate_pipeline.py | 3 +- .../pipeline_smoke_tests_pre_merge.yaml | 12 +- .buildkite/pipeline_smoke_tests_release.yaml | 862 +++++++++--------- 3 files changed, 434 insertions(+), 443 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 3c3b9c41edf..baf72d09726 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -12,7 +12,8 @@ ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', - 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod' + 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod', + 'lambda_cloud' ] QUEUE_GENERIC_CLOUD = 'generic_cloud' QUEUE_KUBERNETES = 'kubernetes' diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index 35ba7ea17ec..d3e992a3189 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -3,15 +3,15 @@ env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) steps: -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env("aws") == "1" - label: test_yaml_launch_and_mount on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount --azure if: build.env("azure") == "1" label: test_yaml_launch_and_mount on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + if: build.env("aws") == "1" + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index fb22f52afec..c173e2ae0ca 100644 --- 
a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -9,236 +9,199 @@ env: steps: - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_spot_recovery on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env("azure") == "1" - label: test_azure_storage_mounts_with_stop on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_list_source on aws + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions --aws if: build.env("aws") == "1" - label: test_managed_jobs_cancellation_aws on aws + label: TestStorageWithCredentials::test_gcs_regions on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws if: build.env("aws") == "1" - label: test_autostop on aws + label: test_job_queue_with_docker on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws if: build.env("aws") == "1" - label: test_cli_logs on aws + label: test_skyserve_rolling_update on aws - agents: queue: generic_cloud - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp if: build.env("gcp") == "1" - label: test_gcp_disk_tier on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_custom_image on kubernetes + label: test_gcp_region_and_service_account on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws if: build.env("aws") == "1" - label: test_launch_fast on aws + label: test_managed_jobs_recovery_multi_node_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - if: build.env("azure") == "1" - label: test_skyserve_azure_http on azure + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + if: build.env("gcp") == "1" + label: test_gcp_zero_quota_failover on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws --aws if: build.env("aws") == "1" - label: test_managed_jobs_storage on aws + label: test_managed_jobs_pipeline_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_aws_regions on aws + label: test_docker_preinstalled_package on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_nonexistent_bucket on aws + label: test_clone_disk_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail --aws if: build.env("aws") == "1" - label: test_long_setup_run_script on aws + label: test_skyserve_readiness_timeout_fail on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_private_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env("gcp") == "1" - label: test_gcp_http_server_with_custom_ports on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - if: build.env("aws") == "1" - label: test_lambda_job_queue on aws + label: test_using_file_mounts_with_env_vars on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws if: build.env("aws") == "1" - label: test_clone_disk_aws on aws + label: test_huggingface on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup --aws if: build.env("aws") == "1" - label: 
test_aws_with_ssh_proxy_command on aws + label: test_managed_jobs_failed_setup on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts --aws if: build.env("aws") == "1" - label: test_launch_fast_with_autostop on aws + label: test_docker_storage_mounts on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env("aws") == "1" - label: test_cancel_pytorch on aws + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws if: build.env("aws") == "1" - label: test_inline_env on aws + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env("gcp") == "1" - label: test_skyserve_gcp_http on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback --aws if: build.env("aws") == "1" - label: test_fast_large_job_queue on aws + label: test_skyserve_base_ondemand_fallback on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws if: build.env("aws") == "1" - label: test_file_mounts on aws + label: test_skyserve_failures on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env("azure") == "1" - label: test_azure_region on azure + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_failover on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws + label: test_aws_images on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_pipeline_recovery_gcp on gcp + label: test_skyserve_spot_recovery on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws if: build.env("aws") == "1" - label: test_managed_jobs_inline_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env("azure") == "1" - label: test_azure_images on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env("gcp") == "1" - label: 
test_gcp_zone on gcp + label: test_stale_job on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image --kubernetes if: build.env("kubernetes") == "1" - label: test_kubernetes_context_failover on kubernetes + label: test_kubernetes_custom_image on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws if: build.env("aws") == "1" - label: test_task_labels_aws on aws + label: test_custom_default_conda_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env("aws") == "1" - label: test_job_pipeline on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict_zone on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + if: build.env("gcp") == "1" + label: test_gcp_force_enable_external_ips on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws if: build.env("aws") == "1" - label: test_managed_jobs_recovery_multi_node_aws on aws + label: test_multi_node_failure on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws if: build.env("aws") == "1" - label: test_aws_image_id_dict on aws + label: test_core_api_sky_launch_fast on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws if: build.env("aws") == "1" - label: test_skyserve_llm on aws + label: test_skyserve_load_balancer on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports --azure if: build.env("azure") == "1" - label: test_azure_best_tier_failover on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env("aws") == "1" - label: test_inferentia on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - if: build.env("kubernetes") == "1" - label: test_tpu_pod_slice_gke on kubernetes + label: test_azure_http_server_with_custom_ports on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws if: build.env("aws") == "1" - label: test_managed_jobs_failed_setup on aws + label: test_cli_logs on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp if: build.env("gcp") == "1" - label: test_clone_disk_gcp on gcp + label: test_gcp_image_id_dict_region on gcp - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws @@ -246,122 +209,127 @@ steps: label: test_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp if: build.env("gcp") == 
"1" - label: test_cancel_gcp on gcp + label: test_tpu on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop --aws if: build.env("aws") == "1" - label: test_aws_http_server_with_custom_ports on aws + label: test_aws_storage_mounts_with_stop on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env("gcp") == "1" - label: test_task_labels_gcp on gcp + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_storage_mounts on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env("gcp") == "1" - label: test_gcp_images on gcp + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + if: build.env("aws") == "1" + label: test_launch_fast_with_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_tpu on gcp + label: test_managed_jobs_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws if: build.env("aws") == "1" - label: test_multi_node_failure on aws + label: test_skyserve_cancel on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone 
--aws if: build.env("aws") == "1" - label: test_docker_storage_mounts on aws + label: test_aws_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env("aws") == "1" - label: test_aws_custom_image on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp if: build.env("gcp") == "1" - label: test_tpu_vm_pod on gcp + label: test_skyserve_auto_restart on gcp - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch --kubernetes if: build.env("kubernetes") == "1" - label: test_kubernetes_storage_mounts on kubernetes + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws if: build.env("aws") == "1" - label: test_aws_region on aws + label: test_fast_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws if: build.env("aws") == "1" - label: test_use_spot on aws + label: test_managed_jobs on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws if: build.env("aws") == "1" - label: test_example_app on aws + label: test_aws_image_id_dict_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws 
--aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws if: build.env("aws") == "1" - label: test_cancel_aws on aws + label: test_managed_jobs_recovery_default_resources on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_zone on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws if: build.env("aws") == "1" - label: test_skyserve_aws_http on aws + label: test_launch_fast on aws - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_jobs_kubernetes on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_cancellation_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_public_bucket on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_pipeline_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws if: build.env("aws") == "1" - label: test_distributed_tf on aws + label: TestStorageWithCredentials::test_private_bucket on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws + label: test_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - if: build.env("gcp") == "1" - label: test_core_api_sky_launch_exec on gcp + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws if: build.env("aws") == "1" - label: test_multiple_resources on aws + label: test_skyserve_update on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http --kubernetes if: build.env("kubernetes") == "1" - label: test_task_labels_kubernetes on kubernetes + label: test_skyserve_kubernetes_http on kubernetes - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update @@ -370,394 +338,366 @@ steps: label: test_skyserve_new_autoscaler_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env("azure") == "1" - label: test_azure_disk_tier on azure + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_aws_regions on aws - agents: queue: kubernetes - command: 
pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes --kubernetes if: build.env("kubernetes") == "1" - label: test_kubernetes_context_switch on kubernetes + label: test_task_labels_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env("aws") == "1" - label: test_skyserve_update on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env("gcp") == "1" + label: test_gcp_disk_tier on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_http_server_with_custom_ports on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env("aws") == "1" - label: test_docker_preinstalled_package on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_dynamic_ondemand_fallback on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_gcs_regions on aws + label: test_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_tpu on gcp +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion --aws if: build.env("aws") == "1" - label: test_skyserve_large_readiness_timeout on aws + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent --aws if: build.env("aws") == "1" - label: test_job_queue_multinode on aws + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env("aws") == "1" - label: test_skyserve_streaming on aws + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env("azure") == "1" + label: test_azure_images on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp if: build.env("gcp") == "1" - label: test_tpu on gcp + label: test_tpu_vm on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_unordered on aws + label: test_managed_jobs_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_cancellation_gcp on gcp + label: test_gcp_stale_job_manual_restart on gcp - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered_with_default on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_region on gcp + label: test_use_spot on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure if: build.env("azure") == "1" - label: test_cancel_azure on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env("gcp") == "1" - label: test_gcp_force_enable_external_ips on gcp + label: test_azure_start_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws if: build.env("aws") == "1" - label: test_core_api_sky_launch_fast on aws + label: test_autodown on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env("azure") == "1" - label: test_azure_start_stop_two_nodes on azure + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env("gcp") == "1" + label: test_cancel_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws if: build.env("aws") == "1" - label: test_aws_zone on aws + label: test_user_dependencies on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws if: build.env("aws") == "1" - label: test_aws_zero_quota_failover on aws + label: test_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws if: build.env("aws") == "1" - label: test_using_file_mounts_with_env_vars on aws + label: test_job_queue_multinode on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_switch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env("azure") == "1" + label: test_cancel_azure on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_unordered_with_default on aws + label: test_managed_jobs_cancellation_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered on aws -- agents: - queue: kubernetes - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + label: test_sky_bench on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws if: build.env("aws") == "1" - label: test_minimal on aws + label: test_skyserve_aws_http on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp if: build.env("gcp") == "1" - label: test_gcp_storage_mounts_with_stop on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env("aws") == "1" - label: test_job_queue on aws + label: test_gcp_start_stop on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws if: build.env("aws") == "1" - label: test_aws_images on aws + label: test_multi_hostname on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + 
command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_external_deletion on aws + label: test_multiple_accelerators_ordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage --aws if: build.env("aws") == "1" - label: test_managed_jobs_pipeline_recovery_aws on aws + label: test_managed_jobs_storage on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env("aws") == "1" - label: test_image_no_conda on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env("aws") == "1" - label: test_user_dependencies on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env("gcp") == "1" + label: test_gcp_mig on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered --aws if: build.env("aws") == "1" - label: test_aws_stale_job_manual_restart on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env("gcp") == "1" - label: test_gcp_zero_quota_failover on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_zone on gcp + label: test_multiple_accelerators_ordered on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure if: build.env("azure") == "1" - label: test_azure_start_stop on azure + label: test_azure_disk_tier on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws if: build.env("aws") == "1" - label: test_aws_image_id_dict_zone on aws + label: test_env_check on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale --aws if: build.env("aws") == "1" - label: test_aws_image_id_dict_region on aws + label: test_skyserve_update_autoscale on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws if: build.env("aws") == "1" - label: test_inline_env_file on aws + label: test_cancel_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws if: build.env("aws") == "1" - label: test_managed_jobs on aws + label: test_inferentia on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_invalid_names on aws + label: test_aws_stale_job_manual_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws if: build.env("aws") == "1" - label: 
test_managed_jobs_pipeline_failed_setup on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + label: test_aws_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws if: build.env("aws") == "1" - label: test_stale_job on aws + label: test_job_pipeline on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage --aws if: build.env("aws") == "1" - label: test_skyserve_rolling_update on aws + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws if: build.env("aws") == "1" - label: test_autodown on aws + label: test_distributed_tf on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env("gcp") == "1" - label: test_tpu_vm on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + if: build.env("azure") == "1" + label: test_azure_start_stop_two_nodes on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --aws + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws if: build.env("aws") == "1" - label: test_managed_jobs_recovery_default_resources on aws + label: test_multi_echo on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws - if: build.env("aws") == "1" - label: test_lambda_huggingface on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env("azure") == "1" + label: test_azure_storage_mounts_with_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source --aws if: build.env("aws") == "1" - label: test_job_queue_with_docker on aws + label: TestStorageWithCredentials::test_list_source on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env("aws") == "1" - label: test_skyserve_failures on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + if: build.env("gcp") == "1" + label: test_gcp_http_server_with_custom_ports on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env("gcp") == "1" + label: test_skyserve_gcp_http on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket --aws if: build.env("aws") == "1" - label: test_skyserve_base_ondemand_fallback on aws + label: TestStorageWithCredentials::test_public_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env("aws") == 
"1" - label: test_sky_bench on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env("gcp") == "1" + label: test_task_labels_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws if: build.env("aws") == "1" - label: test_skyserve_cancel on aws + label: test_aws_image_id_dict_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_dynamic_ondemand_fallback on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + if: build.env("azure") == "1" + label: test_azure_best_tier_failover on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_multi_node_gcp on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env("aws") == "1" + label: test_skyserve_llm on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws if: build.env("aws") == "1" - label: test_skyserve_update_autoscale on aws + label: test_example_app on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws if: build.env("aws") == "1" - label: test_env_check on aws + label: TestStorageWithCredentials::test_invalid_names on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws if: build.env("aws") == "1" - label: test_skyserve_fast_update on aws + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + label: test_aws_zero_quota_failover on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + label: test_skyserve_user_bug_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws if: build.env("aws") == "1" - label: test_multi_hostname on aws + label: test_cancel_pytorch on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws if: build.env("aws") == "1" - label: test_multi_echo on aws + label: test_minimal on aws - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_http_server_with_custom_ports on kubernetes + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env("aws") == "1" + label: test_inline_env_file on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop --kubernetes if: build.env("kubernetes") == "1" - label: test_skyserve_kubernetes_http on kubernetes + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws if: build.env("aws") == "1" - label: test_huggingface on aws + label: test_image_no_conda on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion --aws if: build.env("aws") == "1" - label: test_skyserve_load_balancer on aws + label: TestStorageWithCredentials::test_bucket_external_deletion on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp if: build.env("gcp") == "1" label: test_stop_gcp_spot on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - agents: queue: kubernetes command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes @@ -766,100 +706,150 @@ steps: label: test_container_logs_multinode_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - 
--aws - if: build.env("aws") == "1" - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env("gcp") == "1" + label: test_gcp_storage_mounts_with_stop on gcp - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env("aws") == "1" + label: test_multiple_resources on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env("azure") == "1" - label: test_azure_http_server_with_custom_ports on azure + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + if: build.env("aws") == "1" + label: test_file_mounts on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_gcp on gcp + label: test_tpu_vm_pod on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy --aws if: build.env("aws") == "1" - label: test_aws_storage_mounts_with_stop on aws + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env("gcp") == 
"1" - label: test_gcp_start_stop on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_pipeline_failed_setup on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws if: build.env("aws") == "1" - label: test_custom_default_conda_env on aws + label: test_aws_http_server_with_custom_ports on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp if: build.env("gcp") == "1" - label: test_gcp_stale_job_manual_restart on gcp + label: test_gcp_images on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env("gcp") == "1" - label: test_gcp_mig on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env("aws") == "1" + label: test_skyserve_streaming on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_auto_restart on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + if: build.env("azure") == "1" + label: test_skyserve_azure_http on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws if: build.env("aws") == "1" - label: test_aws_disk_tier on aws + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket --aws if: build.env("aws") == "1" - label: test_managed_jobs_recovery_aws on aws + label: TestStorageWithCredentials::test_nonexistent_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + if: build.env("aws") == "1" + label: test_skyserve_fast_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env("aws") == "1" + label: test_task_labels_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws + label: test_multiple_accelerators_unordered on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env("azure") == "1" + label: test_azure_region on azure +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_jobs_kubernetes on kubernetes +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + if: build.env("kubernetes") == "1" + label: test_tpu_pod_slice_gke on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp if: 
build.env("gcp") == "1" - label: test_gcp_image_id_dict on gcp + label: test_core_api_sky_launch_exec on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + if: build.env("aws") == "1" + label: test_aws_with_ssh_proxy_command on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env("aws") == "1" + label: test_aws_custom_image on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + if: build.env("aws") == "1" + label: test_long_setup_run_script on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp if: build.env("gcp") == "1" - label: test_gcp_region_and_service_account on gcp + label: test_clone_disk_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default --aws if: build.env("aws") == "1" - label: test_skyserve_user_bug_restart on aws + label: test_multiple_accelerators_unordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + if: build.env("aws") == "1" + label: test_aws_disk_tier on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout --aws if: build.env("aws") == "1" - label: test_skyserve_readiness_timeout_fail on aws + label: test_skyserve_large_readiness_timeout on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_recovery_multi_node_gcp on gcp From 0c7bfd503f98206895b9a4357a678801154be1d8 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 30 Nov 2024 12:48:55 +0800 Subject: [PATCH 46/64] dynamic generate pipeline --- .buildkite/generate_pipeline.py | 28 +- .../pipeline_smoke_tests_pre_merge.yaml | 17 - .buildkite/pipeline_smoke_tests_release.yaml | 855 ------------------ .pre-commit-config.yaml | 2 +- 4 files changed, 25 insertions(+), 877 deletions(-) delete mode 100644 .buildkite/pipeline_smoke_tests_pre_merge.yaml delete mode 100644 .buildkite/pipeline_smoke_tests_release.yaml diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index baf72d09726..bd01bcf6bb4 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -1,4 +1,24 @@ -"""This script generates a Buildkite pipeline from test files.""" +""" +This script generates a Buildkite pipeline from test files. + +The script will generate two pipelines: + +tests/smoke_tests +├── test_*.py -> release pipeline +├── test_required_before_merge.py -> pre-merge pipeline + +1. release pipeline, which runs all smoke tests by default, some function + support tests by multiple clouds, but we only generate one cloud per test + function to save cost. +2. pre-merge pipeline, which generates all clouds supported by the test + function, author should specify which clouds to run by setting env in the + step. + +We only have credentials for aws/azure/gcp/kubernetes(CLOUD_QUEUE_MAP) now, +smoke tests for those clouds are generated, other clouds are not supported +yet, smoke tests for those clouds are not generated. 
+""" + import ast from collections import defaultdict import copy @@ -32,7 +52,7 @@ 'edit directly.\n' ) - + def _get_full_decorator_path(decorator: ast.AST) -> str: """Recursively get the full path of a decorator.""" if isinstance(decorator, ast.Attribute): @@ -169,8 +189,8 @@ def _convert_pre_merge(test_files: List[str]): output_file_pipelines_map = defaultdict(list) for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') - # We want enable all clouds by default for each test function - # for pre-merge. And let the author controls which clouds + # We want enable all clouds by default for each test function + # for pre-merge. And let the author controls which clouds # to run by parameter. pipeline = _generate_pipeline(test_file, False) output_file_pipelines_map[yaml_file_path].append(pipeline) diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml deleted file mode 100644 index d3e992a3189..00000000000 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) -steps: -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --azure - if: build.env("azure") == "1" - label: test_yaml_launch_and_mount on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env("aws") == "1" - label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml deleted file mode 100644 index c173e2ae0ca..00000000000 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ /dev/null @@ -1,855 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - aws: '1' - azure: '1' - gcp: '1' - kubernetes: '1' -steps: -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_gcs_regions on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws - if: build.env("aws") == "1" - label: test_job_queue_with_docker on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - if: build.env("aws") == "1" - label: test_skyserve_rolling_update on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - if: build.env("gcp") == "1" - label: test_gcp_region_and_service_account on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_recovery_multi_node_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env("gcp") == "1" - label: test_gcp_zero_quota_failover on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_pipeline_recovery_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env("aws") == "1" - label: test_docker_preinstalled_package on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - if: build.env("aws") == "1" - label: test_clone_disk_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws - if: build.env("aws") == "1" - label: test_skyserve_readiness_timeout_fail on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - if: build.env("aws") == "1" - label: test_using_file_mounts_with_env_vars on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - if: build.env("aws") == "1" - label: test_huggingface on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws - if: build.env("aws") == "1" - 
label: test_managed_jobs_failed_setup on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - if: build.env("aws") == "1" - label: test_docker_storage_mounts on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_inline_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws - if: build.env("aws") == "1" - label: test_skyserve_base_ondemand_fallback on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env("aws") == "1" - label: test_skyserve_failures on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_context_failover on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - if: build.env("aws") == "1" - label: test_aws_images on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_spot_recovery on 
gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - if: build.env("aws") == "1" - label: test_stale_job on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_custom_image on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws - if: build.env("aws") == "1" - label: test_custom_default_conda_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_zone on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env("gcp") == "1" - label: test_gcp_force_enable_external_ips on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - if: build.env("aws") == "1" - label: test_multi_node_failure on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws - if: build.env("aws") == "1" - label: test_core_api_sky_launch_fast on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws - if: build.env("aws") == "1" - label: test_skyserve_load_balancer on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env("azure") == "1" - label: test_azure_http_server_with_custom_ports on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - if: build.env("aws") == "1" - label: test_cli_logs on aws -- agents: - 
queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_region on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - if: build.env("aws") == "1" - label: test_large_job_queue on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - if: build.env("gcp") == "1" - label: test_tpu on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - if: build.env("aws") == "1" - label: test_aws_storage_mounts_with_stop on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_storage_mounts on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - if: build.env("aws") == "1" - label: test_launch_fast_with_autostop on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - if: build.env("aws") == "1" - label: test_skyserve_cancel on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - if: build.env("aws") == "1" - label: test_aws_zone on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_auto_restart on gcp -- agents: - queue: kubernetes - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - if: build.env("aws") == "1" - label: test_fast_large_job_queue on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - if: build.env("aws") == "1" - label: test_managed_jobs on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region - --aws - if: build.env("aws") == "1" - label: test_aws_image_id_dict_region on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_recovery_default_resources on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env("gcp") == "1" - label: test_gcp_zone on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - if: build.env("aws") == "1" - label: test_launch_fast on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_cancellation_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_pipeline_recovery_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - if: build.env("aws") == "1" - 
label: TestStorageWithCredentials::test_private_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env("aws") == "1" - label: test_job_queue on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - if: build.env("aws") == "1" - label: test_aws_image_id_dict on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env("aws") == "1" - label: test_skyserve_update on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - if: build.env("kubernetes") == "1" - label: test_skyserve_kubernetes_http on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - if: build.env("aws") == "1" - label: test_skyserve_new_autoscaler_update on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_aws_regions on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_task_labels_kubernetes on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - if: build.env("gcp") == "1" - label: test_gcp_disk_tier on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_http_server_with_custom_ports on kubernetes -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_dynamic_ondemand_fallback on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - if: build.env("aws") == "1" - label: test_inline_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_tpu on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - --aws - if: build.env("aws") == "1" - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env("azure") == "1" - label: test_azure_images on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env("gcp") == "1" - label: test_tpu_vm on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_recovery_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - if: build.env("gcp") == "1" - label: test_gcp_stale_job_manual_restart on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - if: build.env("aws") == "1" - label: test_use_spot 
on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - if: build.env("azure") == "1" - label: test_azure_start_stop on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - if: build.env("aws") == "1" - label: test_autodown on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - if: build.env("gcp") == "1" - label: test_cancel_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env("aws") == "1" - label: test_user_dependencies on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - if: build.env("aws") == "1" - label: test_autostop on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode - --aws - if: build.env("aws") == "1" - label: test_job_queue_multinode on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_context_switch on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - if: build.env("azure") == "1" - label: test_cancel_azure on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_cancellation_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env("aws") == "1" - label: test_sky_bench on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - if: build.env("aws") == "1" - label: test_skyserve_aws_http on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env("gcp") == "1" - label: test_gcp_start_stop on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - if: build.env("aws") == "1" - label: test_multi_hostname on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered_with_default on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_storage on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env("gcp") == "1" - label: test_gcp_mig on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws - if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env("azure") == "1" - label: test_azure_disk_tier on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - if: build.env("aws") == "1" - label: test_env_check on aws -- agents: - queue: 
generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws - if: build.env("aws") == "1" - label: test_skyserve_update_autoscale on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - if: build.env("aws") == "1" - label: test_cancel_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env("aws") == "1" - label: test_inferentia on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - if: build.env("aws") == "1" - label: test_aws_stale_job_manual_restart on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - if: build.env("aws") == "1" - label: test_aws_region on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env("aws") == "1" - label: test_job_pipeline on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - if: build.env("aws") == "1" - label: test_distributed_tf on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env("azure") == "1" - label: test_azure_start_stop_two_nodes on azure -- agents: - queue: generic_cloud - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - if: build.env("aws") == "1" - label: test_multi_echo on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env("azure") == "1" - label: test_azure_storage_mounts_with_stop on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_list_source on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env("gcp") == "1" - label: test_gcp_http_server_with_custom_ports on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env("gcp") == "1" - label: test_skyserve_gcp_http on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_public_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env("gcp") == "1" - label: test_task_labels_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - if: build.env("aws") == "1" - label: test_aws_image_id_dict_zone on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - if: build.env("azure") == "1" - label: test_azure_best_tier_failover on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - if: build.env("aws") == "1" - label: 
test_skyserve_llm on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - if: build.env("aws") == "1" - label: test_example_app on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_invalid_names on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws - if: build.env("aws") == "1" - label: test_aws_zero_quota_failover on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - if: build.env("aws") == "1" - label: test_skyserve_user_bug_restart on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env("aws") == "1" - label: test_cancel_pytorch on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - if: build.env("aws") == "1" - label: test_minimal on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - if: build.env("aws") == "1" - label: test_inline_env_file on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env("aws") == "1" - label: test_image_no_conda on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_external_deletion on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - if: build.env("gcp") == "1" - label: test_stop_gcp_spot on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_multinode_kubernetes on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - if: build.env("gcp") == "1" - label: test_gcp_storage_mounts_with_stop on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - if: build.env("aws") == "1" - label: test_multiple_resources on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws - if: build.env("aws") == "1" - label: test_file_mounts on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - if: build.env("gcp") == "1" - label: test_tpu_vm_pod on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_pipeline_failed_setup on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - if: build.env("aws") == "1" - label: test_aws_http_server_with_custom_ports on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env("gcp") == "1" - label: test_gcp_images on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env("aws") == "1" - label: test_skyserve_streaming on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - if: build.env("azure") == "1" - label: test_skyserve_azure_http on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_nonexistent_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws - if: build.env("aws") == "1" - label: test_skyserve_fast_update on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - if: build.env("aws") == "1" - label: test_task_labels_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered - --aws - if: 
build.env("aws") == "1" - label: test_multiple_accelerators_unordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env("azure") == "1" - label: test_azure_region on azure -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_jobs_kubernetes on kubernetes -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - if: build.env("kubernetes") == "1" - label: test_tpu_pod_slice_gke on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - if: build.env("gcp") == "1" - label: test_core_api_sky_launch_exec on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - if: build.env("aws") == "1" - label: test_aws_with_ssh_proxy_command on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env("aws") == "1" - label: test_aws_custom_image on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - if: build.env("aws") == "1" - label: test_long_setup_run_script on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - if: build.env("gcp") == "1" - label: test_clone_disk_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default - --aws - if: build.env("aws") == "1" - label: test_multiple_accelerators_unordered_with_default on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - if: build.env("aws") == "1" - label: test_aws_disk_tier on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - if: build.env("aws") == "1" - label: test_skyserve_large_readiness_timeout on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_multi_node_gcp on gcp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db40b03b5fa..25fab5b468a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,7 +56,7 @@ repos: hooks: - id: yapf name: yapf - exclude: (build/.*|sky/skylet/providers/ibm/.*) # Matches exclusions from the script + exclude: (sky/skylet/providers/ibm/.*) # Matches exclusions from the script args: ['--recursive', '--parallel'] # Only necessary flags additional_dependencies: [toml==0.10.2] From b14a655b67311cd730b10fbe1e484c4d7f1587a3 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 30 Nov 2024 13:50:36 +0800 Subject: [PATCH 47/64] fix pre-commit --- .buildkite/generate_pipeline.py | 39 ++++++++++++++------------------- .pre-commit-config.yaml | 2 +- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index bd01bcf6bb4..f0b85263551 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,6 +24,7 @@ import copy import os import random + from typing import Any, Dict, List, Optional import yaml @@ -46,11 +47,9 @@ 'kubernetes': QUEUE_KUBERNETES } -GENERATED_FILE_HEAD = ( - '# This is an auto-generated Buildkite pipeline by ' - '.buildkite/generate_pipeline.py, Please do not ' - 'edit directly.\n' -) +GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by ' + 
'.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n') def _get_full_decorator_path(decorator: ast.AST) -> str: @@ -107,8 +106,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if cloud not in clouds_to_exclude ] final_clouds_to_include = [ - cloud for cloud in clouds_to_include - if cloud in CLOUD_QUEUE_MAP + cloud for cloud in clouds_to_include if cloud in CLOUD_QUEUE_MAP ] if clouds_to_include and not final_clouds_to_include: print(f'Warning: {file_path}:{node.name} ' @@ -122,7 +120,8 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: return function_cloud_map -def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dict[str, Any]: +def _generate_pipeline(test_file: str, + one_cloud_per_test_function: bool) -> Dict[str, Any]: """Generate a Buildkite pipeline from test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) @@ -144,13 +143,11 @@ def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dic return {'steps': steps} -def _dump_pipeline_to_file( - output_file_pipelines_map: Dict[str, List[Dict[str, Any]]], - extra_env: Optional[Dict[str, str]] = None): - default_env = { - 'LOG_TO_STDOUT': '1', - 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' - } +def _dump_pipeline_to_file(output_file_pipelines_map: Dict[str, + List[Dict[str, + Any]]], + extra_env: Optional[Dict[str, str]] = None): + default_env = {'LOG_TO_STDOUT': '1', 'PYTHONPATH': '${PYTHONPATH}:$(pwd)'} if extra_env: default_env.update(extra_env) @@ -163,12 +160,10 @@ def _dump_pipeline_to_file( # Shuffle the steps to avoid flakyness, consecutive runs of the same # kind of test may fail for requiring locks on the same resources. 
random.shuffle(all_steps) - final_pipeline = { - 'steps': all_steps, - 'env': default_env - } + final_pipeline = {'steps': all_steps, 'env': default_env} yaml.dump(final_pipeline, file, default_flow_style=False) + def _convert_release(test_files: List[str]): yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' output_file_pipelines_map = defaultdict(list) @@ -179,9 +174,8 @@ def _convert_release(test_files: List[str]): output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') # Enable all clouds by default for release pipeline. - _dump_pipeline_to_file(output_file_pipelines_map, extra_env={ - cloud: '1' for cloud in CLOUD_QUEUE_MAP - }) + _dump_pipeline_to_file(output_file_pipelines_map, + extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP}) def _convert_pre_merge(test_files: List[str]): @@ -197,6 +191,7 @@ def _convert_pre_merge(test_files: List[str]): print(f'Converted {test_file} to {yaml_file_path}\n\n') _dump_pipeline_to_file(output_file_pipelines_map) + def main(): test_files = os.listdir('tests/smoke_tests') release_files = [] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25fab5b468a..a1e6cc4a7dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,7 +57,7 @@ repos: - id: yapf name: yapf exclude: (sky/skylet/providers/ibm/.*) # Matches exclusions from the script - args: ['--recursive', '--parallel'] # Only necessary flags + args: ['--recursive', '--parallel', '--in-place'] # Only necessary flags additional_dependencies: [toml==0.10.2] - repo: https://github.com/pylint-dev/pylint From f4a1b366d9d062f4fb261e2f66028b17b2a81963 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 30 Nov 2024 19:25:59 +0800 Subject: [PATCH 48/64] format --- .buildkite/generate_pipeline.py | 1 - .pre-commit-config.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index f0b85263551..0c3703e468d 100644 
--- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,7 +24,6 @@ import copy import os import random - from typing import Any, Dict, List, Optional import yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a1e6cc4a7dd..81f794dac24 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,6 @@ repos: args: - "--sg=build/**" # Matches "${ISORT_YAPF_EXCLUDES[@]}" - "--sg=sky/skylet/providers/ibm/**" - files: "^(sky|tests|examples|llm|docs)/.*" # Only match these directories # Second isort command - id: isort name: isort (IBM specific) From 60c9290790885fdadfc3503831527ea5130de12d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 12:17:41 +0800 Subject: [PATCH 49/64] support SUPPRESS_SENSITIVE_LOG --- .buildkite/generate_pipeline.py | 38 +++++++++++++++++++++++++++------ tests/smoke_tests/util.py | 11 ++++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 0c3703e468d..8b2cf65e8b9 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -7,6 +7,10 @@ ├── test_*.py -> release pipeline ├── test_required_before_merge.py -> pre-merge pipeline +run `python .buildkite/generate_pipeline.py` to generate the pipeline for +testing. The CI will run this script as a pre-step, and use the generated +pipeline to run the tests. + 1. release pipeline, which runs all smoke tests by default, some function support tests by multiple clouds, but we only generate one cloud per test function to save cost. @@ -36,7 +40,9 @@ 'lambda_cloud' ] QUEUE_GENERIC_CLOUD = 'generic_cloud' +QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve' QUEUE_KUBERNETES = 'kubernetes' +QUEUE_KUBERNETES_SERVE = 'kubernetes_serve' # Only aws, gcp, azure, and kubernetes are supported for now. # Other clouds do not have credentials. 
CLOUD_QUEUE_MAP = { @@ -45,6 +51,15 @@ 'azure': QUEUE_GENERIC_CLOUD, 'kubernetes': QUEUE_KUBERNETES } +# Serve tests runs long, and different test steps usually requires locks. +# Its highly likely to fail if multiple serve tests are running concurrently. +# So we use a different queue that runs only one concurrent test at a time. +SERVE_CLOUD_QUEUE_MAP = { + 'aws': QUEUE_GENERIC_CLOUD_SERVE, + 'gcp': QUEUE_GENERIC_CLOUD_SERVE, + 'azure': QUEUE_GENERIC_CLOUD_SERVE, + 'kubernetes': QUEUE_KUBERNETES_SERVE +} GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by ' '.buildkite/generate_pipeline.py, Please do not ' @@ -82,6 +97,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: clouds_to_include = [] clouds_to_exclude = [] + is_serve_test = False for decorator in node.decorator_list: if isinstance(decorator, ast.Call): # We only need to consider the decorator with no arguments @@ -94,6 +110,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if suffix.startswith('no_'): clouds_to_exclude.append(suffix[3:]) else: + if suffix == 'serve': + is_serve_test = True + continue if suffix not in ALL_CLOUDS_IN_SMOKE_TESTS: # This mark does not specify a cloud, so we skip it. 
continue @@ -104,8 +123,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: cloud for cloud in clouds_to_include if cloud not in clouds_to_exclude ] + cloud_queue_map = SERVE_CLOUD_QUEUE_MAP if is_serve_test else CLOUD_QUEUE_MAP final_clouds_to_include = [ - cloud for cloud in clouds_to_include if cloud in CLOUD_QUEUE_MAP + cloud for cloud in clouds_to_include if cloud in cloud_queue_map ] if clouds_to_include and not final_clouds_to_include: print(f'Warning: {file_path}:{node.name} ' @@ -115,7 +135,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: continue function_name = (f'{class_name}::{node.name}' if class_name else node.name) - function_cloud_map[function_name] = (clouds_to_include) + function_cloud_map[function_name] = (final_clouds_to_include, [ + cloud_queue_map[cloud] for cloud in final_clouds_to_include + ]) return function_cloud_map @@ -124,15 +146,16 @@ def _generate_pipeline(test_file: str, """Generate a Buildkite pipeline from test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) - for test_function, clouds in function_cloud_map.items(): - for cloud in clouds: + for test_function, clouds_and_queues in function_cloud_map.items(): + for cloud, queue in zip(*clouds_and_queues): step = { 'label': f'{test_function} on {cloud}', 'command': f'pytest {test_file}::{test_function} --{cloud}', 'agents': { # Separate agent pool for each cloud. - # Since some are more costly - 'queue': CLOUD_QUEUE_MAP[cloud] + # Since they require different amount of resources and + # concurrency control. 
+ 'queue': queue }, 'if': f'build.env("{cloud}") == "1"' } @@ -188,7 +211,8 @@ def _convert_pre_merge(test_files: List[str]): pipeline = _generate_pipeline(test_file, False) output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') - _dump_pipeline_to_file(output_file_pipelines_map) + _dump_pipeline_to_file(output_file_pipelines_map, + extra_env={'SUPPRESS_SENSITIVE_LOG': '1'}) def main(): diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 413b238703c..2675bb0e35a 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -1,5 +1,6 @@ import enum import inspect +import logging import os import subprocess import sys @@ -12,6 +13,7 @@ import sky from sky import serve +from sky import sky_logging from sky.clouds import AWS from sky.clouds import GCP from sky.jobs.state import ManagedJobStatus @@ -57,6 +59,15 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) +# Suppress the sensitive log in smoke tests. +SUPPRESS_SENSITIVE_LOG = os.environ.get('SUPPRESS_SENSITIVE_LOG', None) +if SUPPRESS_SENSITIVE_LOG: + provisioner_logger = sky_logging.init_logger('sky.provisioner') + optimizer_logger = sky_logging.init_logger('sky.optimizer') + # Do not print the debug logs. 
+ provisioner_logger.setLevel(logging.INFO) + optimizer_logger.setLevel(logging.INFO) + def _statuses_to_str(statuses: List[enum.Enum]): """Convert a list of enums to a string with all the values separated by |.""" From b22afff1aab2f2c186423efc362ac734595603ef Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 13:03:30 +0800 Subject: [PATCH 50/64] support env SKYPILOT_SUPPRESS_SENSITIVE_LOG to suppress debug log --- .buildkite/generate_pipeline.py | 2 +- sky/sky_logging.py | 13 +++++++++++++ sky/utils/env_options.py | 1 + tests/smoke_tests/util.py | 9 --------- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 8b2cf65e8b9..6c5a3b0d21e 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -212,7 +212,7 @@ def _convert_pre_merge(test_files: List[str]): output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') _dump_pipeline_to_file(output_file_pipelines_map, - extra_env={'SUPPRESS_SENSITIVE_LOG': '1'}) + extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'}) def main(): diff --git a/sky/sky_logging.py b/sky/sky_logging.py index 75dc836a49e..f76f5a31b94 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -15,6 +15,7 @@ not env_options.Options.MINIMIZE_LOGGING.get()) _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s' _DATE_FORMAT = '%m-%d %H:%M:%S' +_SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer'] class NewLineFormatter(logging.Formatter): @@ -75,6 +76,18 @@ def _setup_logger(): # Setting this will avoid the message # being propagated to the parent logger. _root_logger.propagate = False + if env_options.Options.SUPPRESS_SENSITIVE_LOG.get(): + # If the sensitive log is enabled, we force set the level to INFO + # to suppress the debug logs for certain loggers. + # Do not propagate to the parent logger to avoid parent + # logger printing the logs. 
+ for logger_name in _SENSITIVE_LOGGER: + logger = logging.getLogger(logger_name) + handler_to_logger = RichSafeStreamHandler(sys.stdout) + handler_to_logger.flush = sys.stdout.flush # type: ignore + logger.addHandler(handler_to_logger) + logger.setLevel(logging.INFO) + logger.propagate = False def reload_logger(): diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index ebec8eeb90d..cfc20a76253 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -11,6 +11,7 @@ class Options(enum.Enum): SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False) DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False) MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True) + SUPPRESS_SENSITIVE_LOG = ('SKYPILOT_SUPPRESS_SENSITIVE_LOG', False) # Internal: this is used to skip the cloud user identity check, which is # used to protect cluster operations in a multi-identity scenario. # Currently, this is only used in the job and serve controller, as there diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 2675bb0e35a..0c583d828be 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -59,15 +59,6 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -# Suppress the sensitive log in smoke tests. -SUPPRESS_SENSITIVE_LOG = os.environ.get('SUPPRESS_SENSITIVE_LOG', None) -if SUPPRESS_SENSITIVE_LOG: - provisioner_logger = sky_logging.init_logger('sky.provisioner') - optimizer_logger = sky_logging.init_logger('sky.optimizer') - # Do not print the debug logs. 
- provisioner_logger.setLevel(logging.INFO) - optimizer_logger.setLevel(logging.INFO) - def _statuses_to_str(statuses: List[enum.Enum]): """Convert a list of enums to a string with all the values separated by |.""" From def4eb7ba17ef1acec43681fc495736c32801ecb Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 13:21:20 +0800 Subject: [PATCH 51/64] support env SKYPILOT_SUPPRESS_SENSITIVE_LOG to suppress debug log --- sky/sky_logging.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sky/sky_logging.py b/sky/sky_logging.py index f76f5a31b94..944cbcf46d4 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -77,16 +77,21 @@ def _setup_logger(): # being propagated to the parent logger. _root_logger.propagate = False if env_options.Options.SUPPRESS_SENSITIVE_LOG.get(): - # If the sensitive log is enabled, we force set the level to INFO - # to suppress the debug logs for certain loggers. - # Do not propagate to the parent logger to avoid parent - # logger printing the logs. + # If the sensitive log is enabled, we re init a new handler + # and force set the level to INFO to suppress the debug logs + # for certain loggers. for logger_name in _SENSITIVE_LOGGER: logger = logging.getLogger(logger_name) handler_to_logger = RichSafeStreamHandler(sys.stdout) handler_to_logger.flush = sys.stdout.flush # type: ignore logger.addHandler(handler_to_logger) logger.setLevel(logging.INFO) + if _show_logging_prefix: + handler_to_logger.setFormatter(FORMATTER) + else: + handler_to_logger.setFormatter(NO_PREFIX_FORMATTER) + # Do not propagate to the parent logger to avoid parent + # logger printing the logs. 
logger.propagate = False From bef1cf1c580093914720d80a9747fbc8c88027e3 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 4 Dec 2024 16:57:15 +0800 Subject: [PATCH 52/64] add backward_compatibility_tests to pipeline --- .buildkite/generate_pipeline.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 6c5a3b0d21e..f2edae5dfca 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -209,6 +209,14 @@ def _convert_pre_merge(test_files: List[str]): # for pre-merge. And let the author controls which clouds # to run by parameter. pipeline = _generate_pipeline(test_file, False) + pipeline['steps'].append({ + 'label': 'Backward compatibility test', + 'command': 'bash tests/backward_compatibility_tests.sh', + 'agents': { + 'queue': 'back_compat' + }, + 'if': 'build.env("aws") == "1"' + }) output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') _dump_pipeline_to_file(output_file_pipelines_map, From cd64c4c4a43b65fd54a8d5ac1eabf575c2dae02c Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 4 Dec 2024 17:41:21 +0800 Subject: [PATCH 53/64] pip install uv for backward compatibility test --- tests/backward_compatibility_tests.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 696b87ff6ad..511b2c9ba6b 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -35,7 +35,8 @@ rm -r ~/.sky/wheels || true cd ../sky-master git pull origin master pip uninstall -y skypilot -pip install -e ".[all]" +pip install uv +uv pip install -e ".[all]" cd - conda env list | grep sky-back-compat-current || conda create -n sky-back-compat-current -y python=3.9 @@ -43,7 +44,8 @@ conda activate sky-back-compat-current conda install -c conda-forge google-cloud-sdk -y rm -r ~/.sky/wheels || 
true pip uninstall -y skypilot -pip install -e ".[all]" +pip install uv +uv pip install -e ".[all]" # exec + launch From cd4d6e13a787d9e4449516119bcdf18c011d49fa Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 5 Dec 2024 14:20:43 +0800 Subject: [PATCH 54/64] import style --- tests/smoke_tests/__init__.py | 2 +- .../{util.py => smoke_tests_utils.py} | 0 tests/smoke_tests/test_basic.py | 145 +++--- tests/smoke_tests/test_cluster_job.py | 430 +++++++++--------- tests/smoke_tests/test_images.py | 110 +++-- tests/smoke_tests/test_managed_job.py | 319 +++++++------ tests/smoke_tests/test_mount_and_storage.py | 97 ++-- tests/smoke_tests/test_region_and_zone.py | 57 ++- .../smoke_tests/test_required_before_merge.py | 13 +- tests/smoke_tests/test_sky_serve.py | 102 ++--- 10 files changed, 634 insertions(+), 641 deletions(-) rename tests/smoke_tests/{util.py => smoke_tests_utils.py} (100%) diff --git a/tests/smoke_tests/__init__.py b/tests/smoke_tests/__init__.py index 7f91740c201..63d4cd2b811 100644 --- a/tests/smoke_tests/__init__.py +++ b/tests/smoke_tests/__init__.py @@ -1,2 +1,2 @@ """For smoke tests import.""" -__all__ = ['util'] +__all__ = ['smoke_tests_utils'] diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/smoke_tests_utils.py similarity index 100% rename from tests/smoke_tests/util.py rename to tests/smoke_tests/smoke_tests_utils.py diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index e5ab315434b..e8dffe53846 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -26,16 +26,7 @@ import time import pytest -from smoke_tests.util import BUMP_UP_SECONDS -from smoke_tests.util import get_cluster_name -from smoke_tests.util import get_cmd_wait_until_cluster_status_contains -from smoke_tests.util import ( - get_cmd_wait_until_job_status_contains_without_matching_job) -from smoke_tests.util import get_timeout -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_TYPE -from 
smoke_tests.util import Test -from smoke_tests.util import VALIDATE_LAUNCH_OUTPUT +from smoke_tests import smoke_tests_utils import sky from sky.skylet import events @@ -45,25 +36,25 @@ # ---------- Dry run: 2 Tasks in a chain. ---------- @pytest.mark.no_fluidstack #requires GCP and AWS set up def test_example_app(): - test = Test( + test = smoke_tests_utils.Test( 'example_app', ['python examples/example_app.py'], ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- A minimal task ---------- def test_minimal(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'minimal', [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', # Output validation done. f'sky logs {name} 1 --status', f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. 
# Check the logs downloading @@ -89,20 +80,20 @@ def test_minimal(generic_cloud: str): f'sky exec -c {name} && exit 1 || true', ], f'sky down -y {name}', - get_timeout(generic_cloud), + smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Test fast launch ---------- def test_launch_fast(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() - test = Test( + test = smoke_tests_utils.Test( 'test_launch_fast', [ # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', # Second launch to test fast launch - should not reprovision @@ -118,9 +109,9 @@ def test_launch_fast(generic_cloud: str): f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=get_timeout(generic_cloud), + timeout=smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # See cloud exclusion explanations in test_autostop @@ -129,35 +120,35 @@ def test_launch_fast(generic_cloud: str): @pytest.mark.no_ibm @pytest.mark.no_kubernetes def test_launch_fast_with_autostop(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. 
autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = Test( + test = smoke_tests_utils.Test( 'test_launch_fast_with_autostop', [ # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', # Ensure cluster is stopped - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=autostop_timeout), # Even the cluster is stopped, cloud platform may take a while to # delete the VM. - f'sleep {BUMP_UP_SECONDS}', + f'sleep {smoke_tests_utils.BUMP_UP_SECONDS}', # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=get_timeout(generic_cloud) + autostop_timeout, + timeout=smoke_tests_utils.get_timeout(generic_cloud) + autostop_timeout, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ------------ Test stale job ------------ @@ -165,14 +156,14 @@ def test_launch_fast_with_autostop(generic_cloud: str): @pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_stale_job(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = 
smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'stale_job', [ f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=100), @@ -182,16 +173,16 @@ def test_stale_job(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_stale_job_manual_restart(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.AWS.max_cluster_name_length()) region = 'us-east-2' - test = Test( + test = smoke_tests_utils.Test( 'aws_stale_job_manual_restart', [ f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', @@ -203,7 +194,7 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=40), @@ -211,6 +202,7 @@ def test_aws_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. + smoke_tests_utils. 
get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, job_status=[sky.JobStatus.FAILED_DRIVER], @@ -218,12 +210,12 @@ def test_aws_stale_job_manual_restart(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_stale_job_manual_restart(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.GCP.max_cluster_name_length()) zone = 'us-west2-a' @@ -232,7 +224,7 @@ def test_gcp_stale_job_manual_restart(): f'--zones={zone} --format="value(name)"') stop_cmd = (f'gcloud compute instances stop --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'gcp_stale_job_manual_restart', [ f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', @@ -244,6 +236,7 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. + smoke_tests_utils. get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, job_status=[sky.JobStatus.FAILED_DRIVER], @@ -251,16 +244,16 @@ def test_gcp_stale_job_manual_restart(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Check Sky's environment variables; workdir. 
---------- @pytest.mark.no_fluidstack # Requires amazon S3 @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_env_check(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( + test = smoke_tests_utils.Test( 'env_check', [ f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', @@ -269,19 +262,19 @@ def test_env_check(generic_cloud: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- CLI logs ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. def test_cli_logs(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() num_nodes = 2 if generic_cloud == 'kubernetes': # Kubernetes does not support multi-node num_nodes = 1 timestamp = time.time() - test = Test('cli_logs', [ + test = smoke_tests_utils.Test('cli_logs', [ f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', f'sky exec {name} "echo {timestamp} 2"', f'sky exec {name} "echo {timestamp} 3"', @@ -292,17 +285,17 @@ def test_cli_logs(generic_cloud: str): f'sky logs {name} 1 | grep "{timestamp} 1"', f'sky logs {name} | grep "{timestamp} 4"', ], f'sky down -y {name}') - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_logs(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() timestamp = time.time() - test = Test( + test = smoke_tests_utils.Test( 'SCP_cli_logs', [ - f'sky launch -y -c {name} {SCP_TYPE} "echo {timestamp} 1"', + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} "echo {timestamp} 1"', f'sky exec {name} "echo {timestamp} 2"', f'sky exec {name} "echo {timestamp} 3"', f'sky exec {name} "echo {timestamp} 4"', @@ -314,7 +307,7 @@ def test_scp_logs(): ], 
f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ------- Testing the core API -------- @@ -324,7 +317,7 @@ def test_scp_logs(): @pytest.mark.gcp def test_core_api_sky_launch_exec(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task = sky.Task(run="whoami") task.set_resources(sky.Resources(cloud=sky.GCP())) job_id, handle = sky.launch(task, cluster_name=name) @@ -347,7 +340,7 @@ def test_core_api_sky_launch_exec(): # The sky launch CLI has some additional checks to make sure the cluster is up/ # restarted. However, the core API doesn't have these; make sure it still works def test_core_api_sky_launch_fast(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) try: task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) @@ -356,7 +349,7 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=120) @@ -430,8 +423,8 @@ def test_load_dump_yaml_config_equivalent(self): @pytest.mark.no_fluidstack # Fluidstack does not support K80 gpus for now @pytest.mark.no_paperspace # Paperspace does not support K80 gpus def test_multiple_accelerators_ordered(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-ordered', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', @@ -440,14 +433,14 @@ def test_multiple_accelerators_ordered(): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack has low 
availability for T4 GPUs @pytest.mark.no_paperspace # Paperspace does not support T4 GPUs def test_multiple_accelerators_ordered_with_default(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-ordered', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', @@ -456,14 +449,14 @@ def test_multiple_accelerators_ordered_with_default(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs @pytest.mark.no_paperspace # Paperspace does not support T4 GPUs def test_multiple_accelerators_unordered(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-unordered', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', @@ -471,14 +464,14 @@ def test_multiple_accelerators_unordered(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs @pytest.mark.no_paperspace # Paperspace does not support T4 GPUs def test_multiple_accelerators_unordered_with_default(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-unordered-with-default', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', @@ -487,13 +480,13 @@ def test_multiple_accelerators_unordered_with_default(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Requires other clouds to be enabled def test_multiple_resources(): - name = get_cluster_name() - test = Test( + name = 
smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-resources', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', @@ -501,7 +494,7 @@ def test_multiple_resources(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Sky Benchmark ---------- @@ -510,8 +503,8 @@ def test_multiple_resources(): @pytest.mark.no_kubernetes @pytest.mark.aws # SkyBenchmark requires S3 access def test_sky_bench(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'sky-bench', [ f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', @@ -520,7 +513,7 @@ def test_sky_bench(generic_cloud: str): ], f'sky bench down {name} -y; sky bench delete {name} -y', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -558,8 +551,8 @@ def test_kubernetes_context_failover(): with tempfile.NamedTemporaryFile(delete=True) as f: f.write(config.encode('utf-8')) f.flush() - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'kubernetes-context-failover', [ # Check if kind-skypilot is provisioned with H100 annotations already @@ -606,4 +599,4 @@ def test_kubernetes_context_failover(): f'sky down -y {name}-1 {name}-3', env={'SKYPILOT_CONFIG': f.name}, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 8b97ab4eef9..0255884ae30 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -25,19 +25,7 @@ import jinja2 import pytest -from smoke_tests.util import BUMP_UP_SECONDS -from smoke_tests.util import get_aws_region_for_quota_failover -from smoke_tests.util import get_cluster_name -from smoke_tests.util import 
get_cmd_wait_until_cluster_status_contains -from smoke_tests.util import ( - get_cmd_wait_until_job_status_contains_matching_job_id) -from smoke_tests.util import get_gcp_region_for_quota_failover -from smoke_tests.util import get_timeout -from smoke_tests.util import LAMBDA_TYPE -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky from sky import AWS @@ -56,8 +44,8 @@ @pytest.mark.no_paperspace # Paperspace does not have T4 gpus. @pytest.mark.no_oci # OCI does not have T4 gpus def test_job_queue(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'job_queue', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml', @@ -78,7 +66,7 @@ def test_job_queue(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Job Queue with Docker. 
---------- @@ -106,10 +94,10 @@ def test_job_queue(generic_cloud: str): 'docker:winglian/axolotl:main-latest' ]) def test_job_queue_with_docker(generic_cloud: str, image_id: str): - name = get_cluster_name() + image_id[len('docker:'):][:4] + name = smoke_tests_utils.get_cluster_name() + image_id[len('docker:'):][:4] total_timeout_minutes = 40 if generic_cloud == 'azure' else 15 time_to_sleep = 300 if generic_cloud == 'azure' else 180 - test = Test( + test = smoke_tests_utils.Test( 'job_queue_with_docker', [ f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', @@ -145,16 +133,16 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.lambda_cloud def test_lambda_job_queue(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'lambda_job_queue', [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/job_queue/cluster.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/job_queue/cluster.yaml', f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml', f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml', f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml', @@ -168,13 +156,13 @@ def test_lambda_job_queue(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_job_queue(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'ibm_job_queue', [ f'sky launch -y -c {name} --cloud ibm --gpus v100', @@ -191,21 +179,21 @@ def test_ibm_job_queue(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_job_queue(): - name 
= get_cluster_name() + name = smoke_tests_utils.get_cluster_name() num_of_gpu_launch = 1 num_of_gpu_exec = 0.5 - test = Test( + test = smoke_tests_utils.Test( 'SCP_job_queue', [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', + f'sky exec {name} -n {name}-1 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', + f'sky exec {name} -n {name}-2 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', + f'sky exec {name} -n {name}-3 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', f'sky queue {name} | grep {name}-1 | grep RUNNING', f'sky queue {name} | grep {name}-2 | grep RUNNING', f'sky queue {name} | grep {name}-3 | grep PENDING', @@ -216,7 +204,7 @@ def test_scp_job_queue(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs @@ -227,9 +215,9 @@ def test_scp_job_queue(): @pytest.mark.no_oci # OCI Cloud does not have T4 gpus. 
@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet def test_job_queue_multinode(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() total_timeout_minutes = 30 if generic_cloud == 'azure' else 15 - test = Test( + test = smoke_tests_utils.Test( 'job_queue_multinode', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml', @@ -261,14 +249,14 @@ def test_job_queue_multinode(generic_cloud: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_large_job_queue(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'large_job_queue', [ f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', @@ -306,15 +294,15 @@ def test_large_job_queue(generic_cloud: str): f'sky down -y {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_fast_large_job_queue(generic_cloud: str): # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. 
- name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'fast_large_job_queue', [ f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', @@ -325,14 +313,14 @@ def test_fast_large_job_queue(generic_cloud: str): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_job_queue_multinode(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_file = 'examples/job_queue/job_multinode_ibm.yaml' - test = Test( + test = smoke_tests_utils.Test( 'ibm_job_queue_multinode', [ f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2', @@ -364,7 +352,7 @@ def test_ibm_job_queue_multinode(): f'sky down -y {name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Docker with preinstalled package. ---------- @@ -376,8 +364,8 @@ def test_ibm_job_queue_multinode(): @pytest.mark.no_kubernetes # Doesn't support Kubernetes for now # TODO(zhwu): we should fix this for kubernetes def test_docker_preinstalled_package(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'docker_with_preinstalled_package', [ f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx', @@ -387,7 +375,7 @@ def test_docker_preinstalled_package(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Submitting multiple tasks to the same cluster. 
---------- @@ -398,8 +386,8 @@ def test_docker_preinstalled_package(generic_cloud: str): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet @pytest.mark.no_oci # OCI Cloud does not have T4 gpus def test_multi_echo(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multi_echo', [ f'python examples/multi_echo.py {name} {generic_cloud}', @@ -418,6 +406,7 @@ def test_multi_echo(generic_cloud: str): ] + # Ensure jobs succeeded. [ + smoke_tests_utils. get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=i + 1, @@ -430,7 +419,7 @@ def test_multi_echo(generic_cloud: str): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Task: 1 node training. ---------- @@ -439,8 +428,8 @@ def test_multi_echo(generic_cloud: str): @pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA @pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. 
def test_huggingface(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'huggingface_glue_imdb_app', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml', @@ -450,47 +439,47 @@ def test_huggingface(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.lambda_cloud def test_lambda_huggingface(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'lambda_huggingface_glue_imdb_app', [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', + f'sky exec {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_huggingface(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() num_of_gpu_launch = 1 - test = Test( + test = smoke_tests_utils.Test( 'SCP_huggingface_glue_imdb_app', [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky exec {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', + f'sky exec {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Inferentia. ---------- @pytest.mark.aws def test_inferentia(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test_inferentia', [ f'sky launch -y -c {name} -t inf2.xlarge -- echo hi', @@ -500,15 +489,15 @@ def test_inferentia(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU. ---------- @pytest.mark.gcp @pytest.mark.tpu def test_tpu(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_app', [ f'sky launch -y -c {name} examples/tpu/tpu_app.yaml', @@ -519,15 +508,15 @@ def test_tpu(): f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU VM. ---------- @pytest.mark.gcp @pytest.mark.tpu def test_tpu_vm(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_vm_app', [ f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml', @@ -545,15 +534,15 @@ def test_tpu_vm(): f'sky down -y {name}', timeout=30 * 60, # can take 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU VM Pod. 
---------- @pytest.mark.gcp @pytest.mark.tpu def test_tpu_vm_pod(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_pod', [ f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a', @@ -563,14 +552,14 @@ def test_tpu_vm_pod(): f'sky down -y {name}', timeout=30 * 60, # can take 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU Pod Slice on GKE. ---------- @pytest.mark.kubernetes def test_tpu_pod_slice_gke(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_pod_slice_gke', [ f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', @@ -582,15 +571,15 @@ def test_tpu_pod_slice_gke(): f'sky down -y {name}', timeout=30 * 60, # can take 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Simple apps. ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_hostname(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( + test = smoke_tests_utils.Test( 'multi_hostname', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml', @@ -600,15 +589,16 @@ def test_multi_hostname(generic_cloud: str): f'sky logs {name} 2 --status', # Ensure the job succeeded. 
], f'sky down -y {name}', - timeout=get_timeout(generic_cloud, total_timeout_minutes * 60), + timeout=smoke_tests_utils.get_timeout(generic_cloud, + total_timeout_minutes * 60), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_node_failure(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multi_node_failure', [ # TODO(zhwu): we use multi-thread to run the commands in setup @@ -626,14 +616,14 @@ def test_multi_node_failure(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on GCP. ---------- @pytest.mark.gcp def test_gcp_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml', @@ -643,14 +633,14 @@ def test_gcp_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on AWS. ---------- @pytest.mark.aws def test_aws_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml', @@ -660,14 +650,14 @@ def test_aws_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on Azure. 
---------- @pytest.mark.azure def test_azure_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml', @@ -677,14 +667,14 @@ def test_azure_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on Kubernetes. ---------- @pytest.mark.kubernetes def test_kubernetes_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'kubernetes_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml', @@ -694,14 +684,14 @@ def test_kubernetes_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on Paperspace. ---------- @pytest.mark.paperspace def test_paperspace_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'paperspace_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml', @@ -711,14 +701,14 @@ def test_paperspace_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on RunPod. 
---------- @pytest.mark.runpod def test_runpod_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'runpod_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml', @@ -728,13 +718,13 @@ def test_runpod_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Labels from task on AWS (instance_tags) ---------- @pytest.mark.aws def test_task_labels_aws(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() template_str = pathlib.Path( 'tests/test_yamls/test_labels.yaml.j2').read_text() template = jinja2.Template(template_str) @@ -743,7 +733,7 @@ def test_task_labels_aws(): f.write(content) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'task_labels_aws', [ f'sky launch -y -c {name} {file_path}', @@ -758,13 +748,13 @@ def test_task_labels_aws(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Labels from task on GCP (labels) ---------- @pytest.mark.gcp def test_task_labels_gcp(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() template_str = pathlib.Path( 'tests/test_yamls/test_labels.yaml.j2').read_text() template = jinja2.Template(template_str) @@ -773,7 +763,7 @@ def test_task_labels_gcp(): f.write(content) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'task_labels_gcp', [ f'sky launch -y -c {name} {file_path}', @@ -785,13 +775,13 @@ def test_task_labels_gcp(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Labels from task on Kubernetes (labels) ---------- @pytest.mark.kubernetes def test_task_labels_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() template_str = 
pathlib.Path( 'tests/test_yamls/test_labels.yaml.j2').read_text() template = jinja2.Template(template_str) @@ -800,7 +790,7 @@ def test_task_labels_kubernetes(): f.write(content) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'task_labels_kubernetes', [ f'sky launch -y -c {name} {file_path}', @@ -813,14 +803,14 @@ def test_task_labels_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Pod Annotations on Kubernetes ---------- @pytest.mark.kubernetes def test_add_pod_annotations_for_autodown_with_launch(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'add_pod_annotations_for_autodown_with_launch', [ # Launch Kubernetes cluster with two nodes, each being head node and worker node. @@ -838,13 +828,13 @@ def test_add_pod_annotations_for_autodown_with_launch(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_add_and_remove_pod_annotations_with_autostop(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'add_and_remove_pod_annotations_with_autostop', [ # Launch Kubernetes cluster with two nodes, each being head node and worker node. 
@@ -871,13 +861,13 @@ def test_add_and_remove_pod_annotations_with_autostop(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Container logs from task on Kubernetes ---------- @pytest.mark.kubernetes def test_container_logs_multinode_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' head_logs = ('kubectl get pods ' f' | grep {name} | grep head | ' @@ -886,7 +876,7 @@ def test_container_logs_multinode_kubernetes(): f' | grep {name} | grep worker |' " awk '{print $1}' | xargs -I {} kubectl logs {}") with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( + test = smoke_tests_utils.Test( 'container_logs_multinode_kubernetes', [ f'sky launch -y -c {name} {task_yaml} --num-nodes 2', @@ -895,18 +885,18 @@ def test_container_logs_multinode_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_container_logs_two_jobs_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' pod_logs = ('kubectl get pods ' f' | grep {name} | grep head |' " awk '{print $1}' | xargs -I {} kubectl logs {}") with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( + test = smoke_tests_utils.Test( 'test_container_logs_two_jobs_kubernetes', [ f'sky launch -y -c {name} {task_yaml}', @@ -925,18 +915,18 @@ def test_container_logs_two_jobs_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_container_logs_two_simultaneous_jobs_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' pod_logs = ('kubectl get pods ' f' | grep {name} | grep head |' " awk '{print $1}' | xargs -I {} kubectl logs {}") 
with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( + test = smoke_tests_utils.Test( 'test_container_logs_two_simultaneous_jobs_kubernetes', [ f'sky launch -y -c {name}', @@ -956,7 +946,7 @@ def test_container_logs_two_simultaneous_jobs_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Task: n=2 nodes with setups. ---------- @@ -967,8 +957,8 @@ def test_container_logs_two_simultaneous_jobs_kubernetes(): reason= 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') def test_distributed_tf(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'resnet_distributed_tf_app', [ # NOTE: running it twice will hang (sometimes?) - an app-level bug. @@ -978,14 +968,14 @@ def test_distributed_tf(generic_cloud: str): f'sky down -y {name}', timeout=25 * 60, # 25 mins (it takes around ~19 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing GCP start and stop instances ---------- @pytest.mark.gcp def test_gcp_start_stop(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp-start-stop', [ f'sky launch -y -c {name} examples/gcp_start_stop.yaml', @@ -995,14 +985,14 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. 
f'sky stop -y {name}', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[ sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT @@ -1011,14 +1001,14 @@ def test_gcp_start_stop(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Azure start and stop instances ---------- @pytest.mark.azure def test_azure_start_stop(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure-start-stop', [ f'sky launch -y -c {name} examples/azure_start_stop.yaml', @@ -1030,7 +1020,7 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[ sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT @@ -1041,7 +1031,7 @@ def test_azure_start_stop(): f'sky down -y {name}', timeout=30 * 60, # 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Autostopping ---------- @@ -1051,14 +1041,14 @@ def test_azure_start_stop(): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet @pytest.mark.no_kubernetes # Kubernetes does not autostop yet def test_autostop(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. 
autostop_timeout = 600 if generic_cloud == 'azure' else 250 # Launching and starting Azure clusters can take a long time too. e.g., restart # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 - test = Test( + test = smoke_tests_utils.Test( 'autostop', [ f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', @@ -1072,7 +1062,7 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=autostop_timeout), @@ -1091,7 +1081,7 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=autostop_timeout), @@ -1103,27 +1093,27 @@ def test_autostop(generic_cloud: str): 'sleep 45', # Almost reached the threshold. f'sky exec {name} echo hi', # Should restart the timer. 
'sleep 45', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout + BUMP_UP_SECONDS), + timeout=autostop_timeout + smoke_tests_utils.BUMP_UP_SECONDS), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Autodowning ---------- @pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. def test_autodown(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure # the VM is terminated. autodown_timeout = 900 if generic_cloud == 'azure' else 240 total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 - test = Test( + test = smoke_tests_utils.Test( 'autodown', [ f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', @@ -1152,16 +1142,16 @@ def test_autodown(generic_cloud: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_autodown(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'SCP_autodown', [ - f'sky launch -y -d -c {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', f'sky autostop -y {name} --down -i 1', # Ensure autostop is set. f'sky status | grep {name} | grep "1m (down)"', @@ -1171,14 +1161,14 @@ def test_scp_autodown(): # Ensure the cluster is terminated. 
'sleep 200', f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky exec {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', f'sky status | grep {name} | grep "1m (down)"', 'sleep 200', # Ensure the cluster is terminated. f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', f'sky autostop -y {name} --cancel', 'sleep 200', # Ensure the cluster is still UP. 
@@ -1187,11 +1177,11 @@ def test_scp_autodown(): f'sky down -y {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): - test = Test( + test = smoke_tests_utils.Test( f'{cloud}-cancel-task', [ f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', @@ -1214,23 +1204,23 @@ def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): # ---------- Testing `sky cancel` ---------- @pytest.mark.aws def test_cancel_aws(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test = _get_cancel_task_with_cloud(name, 'aws') - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_cancel_gcp(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test = _get_cancel_task_with_cloud(name, 'gcp') - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_cancel_azure(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now @@ -1239,8 +1229,8 @@ def test_cancel_azure(): @pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_cancel_pytorch(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'cancel-pytorch', [ f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d', @@ -1262,15 +1252,15 @@ def test_cancel_pytorch(generic_cloud: str): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` # requires a CUDA public image, which IBM 
doesn't offer @pytest.mark.ibm def test_cancel_ibm(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'ibm-cancel-task', [ f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', @@ -1283,7 +1273,7 @@ def test_cancel_ibm(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing use-spot option ---------- @@ -1295,8 +1285,8 @@ def test_cancel_ibm(): @pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances def test_use_spot(generic_cloud: str): """Test use-spot and sky exec.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'use-spot', [ f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', @@ -1306,14 +1296,14 @@ def test_use_spot(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_stop_gcp_spot(): """Test GCP spot can be stopped, autostopped, restarted.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'stop_gcp_spot', [ f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', @@ -1323,7 +1313,7 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=90), @@ -1332,21 +1322,21 @@ def test_stop_gcp_spot(): f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], 
timeout=120), ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env ---------- def test_inline_env(generic_cloud: str): """Test env""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-inline-env', [ f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', @@ -1356,16 +1346,16 @@ def test_inline_env(generic_cloud: str): f'sky logs {name} 2 --status', ], f'sky down -y {name}', - get_timeout(generic_cloud), + smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env file ---------- def test_inline_env_file(generic_cloud: str): """Test env""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-inline-env-file', [ f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', @@ -1374,17 +1364,17 @@ def test_inline_env_file(generic_cloud: str): f'sky logs {name} 2 --status', ], f'sky down -y {name}', - get_timeout(generic_cloud), + smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing custom image ---------- @pytest.mark.aws def test_aws_custom_image(): """Test AWS custom image""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-aws-custom-image', [ f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image @@ -1393,7 +1383,7 @@ def test_aws_custom_image(): f'sky down -y {name}', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -1410,8 +1400,8 @@ def test_aws_custom_image(): ]) def test_kubernetes_custom_image(image_id): """Test Kubernetes custom image""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-kubernetes-custom-image', [ f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', @@ -1424,13 +1414,13 @@ def test_kubernetes_custom_image(image_id): f'sky down -y {name}', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_start_stop_two_nodes(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure-start-stop-two-nodes', [ f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', @@ -1440,18 +1430,18 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs 
{name} 2 --status', # Ensure the job succeeded. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[ sky.ClusterStatus.INIT, sky.ClusterStatus.STOPPED ], - timeout=200 + BUMP_UP_SECONDS) + + timeout=200 + smoke_tests_utils.BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', timeout=30 * 60, # 30 mins (it takes around ~23 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env for disk tier ---------- @@ -1465,11 +1455,11 @@ def _get_aws_query_command(region, instance_id, field, expected): for disk_tier in list(resources_utils.DiskTier): specs = AWS._get_disk_specs(disk_tier) - name = get_cluster_name() + '-' + disk_tier.value + name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.AWS.max_cluster_name_length()) region = 'us-east-2' - test = Test( + test = smoke_tests_utils.Test( 'aws-disk-tier-' + disk_tier.value, [ f'sky launch -y -c {name} --cloud aws --region {region} ' @@ -1488,14 +1478,14 @@ def _get_aws_query_command(region, instance_id, field, expected): f'sky down -y {name}', timeout=10 * 60, # 10 mins (it takes around ~6 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_disk_tier(): for disk_tier in list(resources_utils.DiskTier): disk_types = [GCP._get_disk_type(disk_tier)] - name = get_cluster_name() + '-' + disk_tier.value + name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.GCP.max_cluster_name_length()) region = 'us-west2' @@ -1510,7 +1500,7 @@ def test_gcp_disk_tier(): instance_type_options = ['', '--instance-type n2-standard-64'] for disk_type, instance_type_option in zip(disk_types, instance_type_options): - test = Test( + test = smoke_tests_utils.Test( 
'gcp-disk-tier-' + disk_tier.value, [ f'sky launch -y -c {name} --cloud gcp --region {region} ' @@ -1524,7 +1514,7 @@ def test_gcp_disk_tier(): f'sky down -y {name}', timeout=6 * 60, # 6 mins (it takes around ~3 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure @@ -1534,11 +1524,11 @@ def test_azure_disk_tier(): # Azure does not support high and ultra disk tier. continue type = Azure._get_disk_type(disk_tier) - name = get_cluster_name() + '-' + disk_tier.value + name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.Azure.max_cluster_name_length()) region = 'westus2' - test = Test( + test = smoke_tests_utils.Test( 'azure-disk-tier-' + disk_tier.value, [ f'sky launch -y -c {name} --cloud azure --region {region} ' @@ -1550,17 +1540,17 @@ def test_azure_disk_tier(): f'sky down -y {name}', timeout=20 * 60, # 20 mins (it takes around ~12 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_best_tier_failover(): type = Azure._get_disk_type(resources_utils.DiskTier.LOW) - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.Azure.max_cluster_name_length()) region = 'westus2' - test = Test( + test = smoke_tests_utils.Test( 'azure-best-tier-failover', [ f'sky launch -y -c {name} --cloud azure --region {region} ' @@ -1572,15 +1562,15 @@ def test_azure_best_tier_failover(): f'sky down -y {name}', timeout=20 * 60, # 20 mins (it takes around ~12 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ------ Testing Zero Quota Failover ------ @pytest.mark.aws def test_aws_zero_quota_failover(): - name = get_cluster_name() - region = get_aws_region_for_quota_failover() + name = smoke_tests_utils.get_cluster_name() + region = smoke_tests_utils.get_aws_region_for_quota_failover() if not region: pytest.xfail( @@ -1589,21 
+1579,21 @@ def test_aws_zero_quota_failover(): 'expected for your account?') return - test = Test( + test = smoke_tests_utils.Test( 'aws-zero-quota-failover', [ f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_zero_quota_failover(): - name = get_cluster_name() - region = get_gcp_region_for_quota_failover() + name = smoke_tests_utils.get_cluster_name() + region = smoke_tests_utils.get_gcp_region_for_quota_failover() if not region: pytest.xfail( @@ -1612,18 +1602,18 @@ def test_gcp_zero_quota_failover(): 'expected for your account?') return - test = Test( + test = smoke_tests_utils.Test( 'gcp-zero-quota-failover', [ f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) def test_long_setup_run_script(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() with tempfile.NamedTemporaryFile('w', prefix='sky_app_', suffix='.yaml') as f: f.write( @@ -1644,7 +1634,7 @@ def test_long_setup_run_script(generic_cloud: str): f.write(' echo "end run"\n') f.flush() - test = Test( + test = smoke_tests_utils.Test( 'long-setup-run-script', [ f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', @@ -1656,4 +1646,4 @@ def test_long_setup_run_script(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index 4fa39d98177..27d6a693ae6 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -20,11 +20,7 @@ # > pytest tests/smoke_tests/test_images.py --generic-cloud aws import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util 
import get_cmd_wait_until_cluster_is_not_found -from smoke_tests.util import get_cmd_wait_until_cluster_status_contains -from smoke_tests.util import run_one_test -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky @@ -32,8 +28,8 @@ # ---------- Test the image ---------- @pytest.mark.aws def test_aws_images(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_images', [ f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', @@ -47,13 +43,13 @@ def test_aws_images(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_images(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_images', [ f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', @@ -67,13 +63,13 @@ def test_gcp_images(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_images(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure_images', [ f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', @@ -87,13 +83,13 @@ def test_azure_images(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_image_id_dict(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_image_id_dict', [ # Use image id dict. 
@@ -106,13 +102,13 @@ def test_aws_image_id_dict(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_image_id_dict(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_image_id_dict', [ # Use image id dict. @@ -125,13 +121,13 @@ def test_gcp_image_id_dict(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_image_id_dict_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_image_id_dict_region', [ # YAML has @@ -162,13 +158,13 @@ def test_aws_image_id_dict_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_image_id_dict_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_image_id_dict_region', [ # Use region to filter image_id dict. @@ -195,13 +191,13 @@ def test_gcp_image_id_dict_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_image_id_dict_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_image_id_dict_zone', [ # YAML has @@ -233,13 +229,13 @@ def test_aws_image_id_dict_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_image_id_dict_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_image_id_dict_zone', [ # Use zone to filter image_id dict. 
@@ -267,19 +263,19 @@ def test_gcp_image_id_dict_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_clone_disk_aws(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'clone_disk_aws', [ f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=60), @@ -294,13 +290,13 @@ def test_clone_disk_aws(): f'sky down -y {name} {name}-clone {name}-clone-2', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_clone_disk_gcp(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'clone_disk_gcp', [ f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', @@ -313,14 +309,14 @@ def test_clone_disk_gcp(): ], f'sky down -y {name} {name}-clone {name}-clone-2', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_mig(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() region = 'us-central1' - test = Test( + test = smoke_tests_utils.Test( 'gcp_mig', [ f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', @@ -331,8 +327,8 @@ def test_gcp_mig(): # Check MIG exists. 
f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - get_cmd_wait_until_cluster_is_not_found(cluster_name=name, - timeout=120), + smoke_tests_utils.get_cmd_wait_until_cluster_is_not_found( + cluster_name=name, timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. @@ -343,12 +339,12 @@ def test_gcp_mig(): ], f'sky down -y {name}', env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_force_enable_external_ips(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test_commands = [ f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', # Check network of vm is "default" @@ -361,17 +357,17 @@ def test_gcp_force_enable_external_ips(): f'sky down -y {name}', ] skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' - test = Test('gcp_force_enable_external_ips', - test_commands, - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': skypilot_config}) - run_one_test(test) + test = smoke_tests_utils.Test('gcp_force_enable_external_ips', + test_commands, + f'sky down -y {name}', + env={'SKYPILOT_CONFIG': skypilot_config}) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_image_no_conda(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'image_no_conda', [ # Use image id dict. 
@@ -384,14 +380,14 @@ def test_image_no_conda(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_custom_default_conda_env(generic_cloud: str): - name = get_cluster_name() - test = Test('custom_default_conda_env', [ + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test('custom_default_conda_env', [ f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky status -r {name} | grep "UP"', f'sky logs {name} 1 --status', @@ -399,7 +395,7 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=80), @@ -408,4 +404,4 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 3 --status', ], f'sky down -y {name}') - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 4d6f1dd9614..c8ef5c1a502 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -27,20 +27,12 @@ import time import pytest +from smoke_tests import smoke_tests_utils from smoke_tests.test_mount_and_storage import TestStorageWithCredentials -from smoke_tests.util import BUMP_UP_SECONDS -from smoke_tests.util import get_cluster_name -from smoke_tests.util import ( - get_cmd_wait_until_managed_job_status_contains_matching_job_name) -from smoke_tests.util import GET_JOB_QUEUE 
-from smoke_tests.util import JOB_WAIT_NOT_RUNNING -from smoke_tests.util import run_one_test -from smoke_tests.util import STORAGE_SETUP_COMMANDS -from smoke_tests.util import Test +import sky from sky import jobs from sky.data import storage as storage_lib -from sky.jobs.state import ManagedJobStatus from sky.skylet import constants from sky.utils import common_utils @@ -52,35 +44,40 @@ @pytest.mark.managed_jobs def test_managed_jobs(generic_cloud: str): """Test the managed jobs yaml.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed-jobs', [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, - ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + sky.ManagedJobStatus.PENDING, + sky.ManagedJobStatus.SUBMITTED, + sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING ], timeout=60), + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, - ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + sky.ManagedJobStatus.PENDING, + sky.ManagedJobStatus.SUBMITTED, + sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING ], timeout=60), f'sky jobs cancel -y -n {name}-1', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status=[ManagedJobStatus.CANCELLED], + job_status=[sky.ManagedJobStatus.CANCELLED], timeout=230), # Test the functionality for logging. 
f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', - f'{GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', ], # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when # canceling multiple job names is supported. @@ -88,7 +85,7 @@ def test_managed_jobs(generic_cloud: str): # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack #fluidstack does not support spot instances @@ -100,36 +97,36 @@ def test_managed_jobs(generic_cloud: str): @pytest.mark.managed_jobs def test_job_pipeline(generic_cloud: str): """Test a job pipeline.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'spot-pipeline', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', 'sleep 5', - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', # `grep -A 4 {name}` finds the job with {name} and the 4 lines # after it, i.e. the 4 tasks within the job. # `sed -n 2p` gets the second line of the 4 lines, i.e. the first # task within the job. 
- f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', f'sky jobs cancel -y -n {name}', 'sleep 5', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', 'sleep 200', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack #fluidstack does not support spot instances @@ -141,22 +138,23 @@ def test_job_pipeline(generic_cloud: str): @pytest.mark.managed_jobs def test_managed_jobs_failed_setup(generic_cloud: str): """Test managed job with failed setup.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed_jobs_failed_setup', [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', # Make sure the job failed quickly. + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.FAILED_SETUP], - timeout=330 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.FAILED_SETUP], + timeout=330 + smoke_tests_utils.BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack #fluidstack does not support spot instances @@ -168,31 +166,32 @@ def test_managed_jobs_failed_setup(generic_cloud: str): @pytest.mark.managed_jobs def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): """Test managed job with failed setup for a pipeline.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.FAILED_SETUP], + job_status=[sky.ManagedJobStatus.FAILED_SETUP], timeout=600), # Make sure the job failed quickly. 
- f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', # Task 0 should be SUCCEEDED. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', # Task 1 should be FAILED_SETUP. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', # Task 2 should be CANCELLED. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', # Task 3 should be CANCELLED. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing managed job recovery ---------- @@ -202,17 +201,18 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): @pytest.mark.managed_jobs def test_managed_jobs_recovery_aws(aws_config_region): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) region = aws_config_region - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. @@ -221,25 +221,26 @@ def test_managed_jobs_recovery_aws(aws_config_region): f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_recovery_gcp(): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) zone = 'us-east4-b' @@ -250,48 +251,51 @@ def test_managed_jobs_recovery_gcp(): f'--zones={zone} --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" 
-y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. terminate_cmd, - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @pytest.mark.managed_jobs def test_managed_jobs_pipeline_recovery_aws(aws_config_region): """Test managed job recovery for a pipeline.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() user_hash = common_utils.get_user_hash() user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] region = aws_config_region if region != 'us-east-2': pytest.skip('Only run spot pipeline recovery test in us-east-2') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -309,11 +313,12 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): f'-{user_hash} ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -323,14 +328,14 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_pipeline_recovery_gcp(): """Test managed job recovery for a pipeline.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() zone = 'us-east4-b' user_hash = common_utils.get_user_hash() user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] @@ -340,13 +345,14 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'--zones={zone} 
 --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -356,11 +362,12 @@ def test_managed_jobs_pipeline_recovery_gcp(): # separated by `-`. (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -370,7 +377,7 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support spot instances @@ -382,39 +389,42 @@ def test_managed_jobs_pipeline_recovery_gcp(): @pytest.mark.managed_jobs def test_managed_jobs_recovery_default_resources(generic_cloud: str): """Test managed job recovery for default resources.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ - ManagedJobStatus.RUNNING, ManagedJobStatus.RECOVERING + sky.ManagedJobStatus.RUNNING, + sky.ManagedJobStatus.RECOVERING ], timeout=360), ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @pytest.mark.managed_jobs def test_managed_jobs_recovery_multi_node_aws(aws_config_region): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) region = aws_config_region - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. @@ -424,25 +434,26 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'Name=tag:ray-node-type,Values=worker ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_recovery_multi_node_gcp(): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) zone = 'us-west2-a' @@ -453,35 +464,37 @@ def test_managed_jobs_recovery_multi_node_gcp(): f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. terminate_cmd, - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @pytest.mark.managed_jobs def test_managed_jobs_cancellation_aws(aws_config_region): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) name_2_on_cloud = common_utils.make_cluster_name_on_cloud( @@ -489,22 +502,24 @@ def test_managed_jobs_cancellation_aws(aws_config_region): name_3_on_cloud = common_utils.make_cluster_name_on_cloud( f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) region = aws_config_region - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_cancellation_aws', [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ - ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING ], - timeout=60 + BUMP_UP_SECONDS), + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -513,15 +528,17 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -530,23 +547,25 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', # The job is running in the cluster, will shown as RUNNING. + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$(aws ec2 describe-instances --region {region} ' @@ -556,13 +575,13 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), ], timeout=25 * 60) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_cancellation_gcp(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_3 = f'{name}-3' name_3_on_cloud = common_utils.make_cluster_name_on_cloud( name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) @@ -576,54 +595,60 @@ def test_managed_jobs_cancellation_gcp(): f'--zones={zone} --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_cancellation_gcp', [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.STARTING], - timeout=60 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.STARTING], + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, - JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' ), ], timeout=25 * 60) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing storage for managed job ---------- @@ -635,7 +660,7 @@ def test_managed_jobs_cancellation_gcp(): @pytest.mark.managed_jobs def test_managed_jobs_storage(generic_cloud: str): """Test storage with managed job""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() yaml_str = pathlib.Path( 'examples/managed_job_with_storage.yaml').read_text() timestamp = int(time.time()) @@ -700,16 +725,17 @@ def test_managed_jobs_storage(generic_cloud: str): f.write(yaml_str) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_storage', [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.SUCCEEDED], - timeout=60 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.SUCCEEDED], + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), # Wait for the job to be cleaned up. 'sleep 20', f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', @@ -721,7 +747,7 @@ def test_managed_jobs_storage(generic_cloud: str): # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing spot TPU ---------- @@ -730,43 +756,46 @@ def test_managed_jobs_storage(generic_cloud: str): @pytest.mark.tpu def test_managed_jobs_tpu(): """Test managed job on TPU.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.STARTING], - timeout=60 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.STARTING], + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), # TPU takes a while to launch + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ - ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED + sky.ManagedJobStatus.RUNNING, sky.ManagedJobStatus.SUCCEEDED ], - timeout=900 + BUMP_UP_SECONDS), + timeout=900 + smoke_tests_utils.BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env for managed jobs ---------- @pytest.mark.managed_jobs def test_managed_jobs_inline_env(generic_cloud: str): """Test managed jobs env""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "echo "\\$TEST_ENV"; ([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.SUCCEEDED], - timeout=20 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.SUCCEEDED], + timeout=20 + smoke_tests_utils.BUMP_UP_SECONDS), f'JOB_ROW=$(sky jobs queue | grep {name} | head -n1) && ' f'echo "$JOB_ROW" && echo "$JOB_ROW" | grep "SUCCEEDED" && ' f'JOB_ID=$(echo "$JOB_ROW" | awk \'{{print $1}}\') && ' @@ -780,4 +809,4 @@ def test_managed_jobs_inline_env(generic_cloud: str): # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py index 6a2f0944fec..4889bdcc85e 100644 --- a/tests/smoke_tests/test_mount_and_storage.py +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -32,12 +32,7 @@ import jinja2 import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util import get_timeout -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky from sky import global_user_state @@ -52,7 +47,7 @@ # ---------- file_mounts ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. def test_file_mounts(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() extra_flags = '' if generic_cloud in 'kubernetes': # Kubernetes does not support multi-node @@ -60,42 +55,42 @@ def test_file_mounts(generic_cloud: str): # arm64 (e.g., Apple Silicon) since goofys does not work on arm64. 
extra_flags = '--num-nodes 1' test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. ] - test = Test( + test = smoke_tests_utils.Test( 'using_file_mounts', test_commands, f'sky down -y {name}', - get_timeout(generic_cloud, 20 * 60), # 20 mins + smoke_tests_utils.get_timeout(generic_cloud, 20 * 60), # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_file_mounts(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} {SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. ] - test = Test( + test = smoke_tests_utils.Test( 'SCP_using_file_mounts', test_commands, f'sky down -y {name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Requires GCP to be enabled def test_using_file_mounts_with_env_vars(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = TestStorageWithCredentials.generate_bucket_name() test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} ' 'examples/using_file_mounts_with_env_vars.yaml ' f'--env MY_BUCKET={storage_name}'), @@ -107,20 +102,20 @@ def test_using_file_mounts_with_env_vars(generic_cloud: str): '--env MY_LOCAL_PATH=tmpfile'), f'sky logs {name}-2 1 --status', # Ensure the job succeeded. 
] - test = Test( + test = smoke_tests_utils.Test( 'using_file_mounts_with_env_vars', test_commands, (f'sky down -y {name} {name}-2', f'sky storage delete -y {storage_name} {storage_name}-2'), timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- storage ---------- @pytest.mark.aws def test_aws_storage_mounts_with_stop(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = 'aws' storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( @@ -132,7 +127,7 @@ def test_aws_storage_mounts_with_stop(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'aws s3 ls {storage_name}/hello.txt', @@ -142,18 +137,18 @@ def test_aws_storage_mounts_with_stop(): # the mounted directory f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] - test = Test( + test = smoke_tests_utils.Test( 'aws_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_storage_mounts_with_stop(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = 'gcp' storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( @@ -165,7 +160,7 @@ def test_gcp_storage_mounts_with_stop(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. 
f'gsutil ls gs://{storage_name}/hello.txt', @@ -175,18 +170,18 @@ def test_gcp_storage_mounts_with_stop(): # the mounted directory f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] - test = Test( + test = smoke_tests_utils.Test( 'gcp_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_storage_mounts_with_stop(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = 'azure' storage_name = f'sky-test-{int(time.time())}' default_region = 'eastus' @@ -203,7 +198,7 @@ def test_azure_storage_mounts_with_stop(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'output=$(az storage blob list -c {storage_name} --account-name {storage_account_name} --account-key {storage_account_key} --prefix hello.txt)' @@ -215,13 +210,13 @@ def test_azure_storage_mounts_with_stop(): # the mounted directory f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] - test = Test( + test = smoke_tests_utils.Test( 'azure_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -229,7 +224,7 @@ def test_kubernetes_storage_mounts(): # Tests bucket mounting on k8s, assuming S3 is configured. # This test will fail if run on non x86_64 architecture, since goofys is # built for x86_64 only. 
- name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() @@ -240,24 +235,24 @@ def test_kubernetes_storage_mounts(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud kubernetes {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'aws s3 ls {storage_name}/hello.txt || ' f'gsutil ls gs://{storage_name}/hello.txt', ] - test = Test( + test = smoke_tests_utils.Test( 'kubernetes_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_kubernetes_context_switch(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() new_context = f'sky-test-context-{int(time.time())}' new_namespace = f'sky-test-namespace-{int(time.time())}' @@ -301,13 +296,13 @@ def test_kubernetes_context_switch(): 'rm /tmp/sky_test_current_context; ' f'sky down -y {name}') - test = Test( + test = smoke_tests_utils.Test( 'kubernetes_context_switch', test_commands, cleanup_commands, timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.parametrize( @@ -323,7 +318,7 @@ def test_kubernetes_context_switch(): ]) def test_docker_storage_mounts(generic_cloud: str, image_id: str): # Tests bucket mounting on docker container - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() timestamp = str(time.time()).replace('.', '') storage_name = f'sky-test-{timestamp}' template_str = pathlib.Path( @@ -354,7 +349,7 @@ def test_docker_storage_mounts(generic_cloud: str, image_id: str): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky 
launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. # Check AWS, GCP, or Azure storage mount. @@ -362,18 +357,18 @@ def test_docker_storage_mounts(generic_cloud: str, image_id: str): f'{gsutil_command} || ' f'{azure_blob_command}', ] - test = Test( + test = smoke_tests_utils.Test( 'docker_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.cloudflare def test_cloudflare_storage_mounts(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( 'tests/test_yamls/test_r2_storage_mounting.yaml').read_text() @@ -385,24 +380,24 @@ def test_cloudflare_storage_mounts(generic_cloud: str): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. 
f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2' ] - test = Test( + test = smoke_tests_utils.Test( 'cloudflare_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_storage_mounts(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = f'sky-test-{int(time.time())}' bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( storage_name, Rclone.RcloneClouds.IBM) @@ -415,18 +410,18 @@ def test_ibm_storage_mounts(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud ibm {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt', ] - test = Test( + test = smoke_tests_utils.Test( 'ibm_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Storage ---------- diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index e2e58bb3c62..706cd3bb64a 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -23,12 +23,7 @@ import textwrap import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util import get_cmd_wait_until_cluster_status_contains_wildcard -from smoke_tests.util import ( - get_cmd_wait_until_managed_job_status_contains_matching_job_name) -from smoke_tests.util import run_one_test -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky from sky.skylet import constants @@ -37,8 +32,8 @@ # ---------- Test region 
---------- @pytest.mark.aws def test_aws_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_region', [ f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', @@ -53,12 +48,12 @@ def test_aws_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_with_ssh_proxy_command(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() with tempfile.NamedTemporaryFile(mode='w') as f: f.write( @@ -67,7 +62,7 @@ def test_aws_with_ssh_proxy_command(): ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} """)) f.flush() - test = Test( + test = smoke_tests_utils.Test( 'aws_with_ssh_proxy_command', [ f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', @@ -81,11 +76,13 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. + smoke_tests_utils. get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', cluster_status=[sky.ClusterStatus.UP], timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ @@ -97,13 +94,13 @@ def test_aws_with_ssh_proxy_command(): ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_region_and_service_account(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_region', [ f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', @@ -120,14 +117,14 @@ def test_gcp_region_and_service_account(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_region(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() region = 'eu-de' - test = Test( + test = smoke_tests_utils.Test( 'region', [ f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', @@ -137,13 +134,13 @@ def test_ibm_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure_region', [ f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', @@ -160,14 +157,14 @@ def test_azure_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Test zone ---------- @pytest.mark.aws def test_aws_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_zone', [ f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', @@ -177,14 +174,14 @@ def test_aws_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_zone(): - name = 
get_cluster_name() + name = smoke_tests_utils.get_cluster_name() zone = 'eu-de-2' - test = Test( + test = smoke_tests_utils.Test( 'zone', [ f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', @@ -194,13 +191,13 @@ def test_ibm_zone(): ], f'sky down -y {name} {name}-2 {name}-3', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_zone', [ f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', @@ -210,4 +207,4 @@ def test_gcp_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_required_before_merge.py index ffaf75e7cbc..1d68b8a81e6 100644 --- a/tests/smoke_tests/test_required_before_merge.py +++ b/tests/smoke_tests/test_required_before_merge.py @@ -19,21 +19,18 @@ # Change cloud for generic tests to aws # > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws -from smoke_tests.util import get_cluster_name -from smoke_tests.util import ( - get_cmd_wait_until_job_status_contains_matching_job_id) -from smoke_tests.util import run_one_test -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky def test_yaml_launch_and_mount(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', [ f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', + smoke_tests_utils. 
get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=1, @@ -43,4 +40,4 @@ def test_yaml_launch_and_mount(generic_cloud: str): f'sky down -y {name}', timeout=5 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py index f56d9bb96ee..5f34eba8728 100644 --- a/tests/smoke_tests/test_sky_serve.py +++ b/tests/smoke_tests/test_sky_serve.py @@ -28,11 +28,7 @@ from typing import List, Tuple import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util import run_one_test -from smoke_tests.util import terminate_gcp_replica -from smoke_tests.util import Test -from smoke_tests.util import test_id +from smoke_tests import smoke_tests_utils from sky import serve from sky.utils import common_utils @@ -49,7 +45,7 @@ def _get_service_name() -> str: test_name = caller_func_name.replace('_', '-').replace('test-', 't-') test_name = test_name.replace('skyserve-', 'ss-') test_name = common_utils.make_cluster_name_on_cloud(test_name, 24) - return f'{test_name}-{test_id}' + return f'{test_name}-{smoke_tests_utils.test_id}' # We check the output of the skyserve service to see if it is ready. 
Output of @@ -107,8 +103,8 @@ def _get_replica_ip(name: str, replica_id: int) -> str: def _get_skyserve_http_test(name: str, cloud: str, - timeout_minutes: int) -> Test: - test = Test( + timeout_minutes: int) -> smoke_tests_utils.Test: + test = smoke_tests_utils.Test( f'test-skyserve-{cloud.replace("_", "-")}', [ f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', @@ -161,7 +157,7 @@ def test_skyserve_gcp_http(): """Test skyserve on GCP""" name = _get_service_name() test = _get_skyserve_http_test(name, 'gcp', 20) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @@ -170,7 +166,7 @@ def test_skyserve_aws_http(): """Test skyserve on AWS""" name = _get_service_name() test = _get_skyserve_http_test(name, 'aws', 20) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure @@ -179,7 +175,7 @@ def test_skyserve_azure_http(): """Test skyserve on Azure""" name = _get_service_name() test = _get_skyserve_http_test(name, 'azure', 30) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -188,7 +184,7 @@ def test_skyserve_kubernetes_http(): """Test skyserve on Kubernetes""" name = _get_service_name() test = _get_skyserve_http_test(name, 'kubernetes', 30) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.oci @@ -197,7 +193,7 @@ def test_skyserve_oci_http(): """Test skyserve on OCI""" name = _get_service_name() test = _get_skyserve_http_test(name, 'oci', 20) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now @@ -218,7 +214,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: encoding='utf-8') as f: prompt2output = json.load(f) - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-llm', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', @@ -231,7 +227,7 @@ def generate_llm_test_command(prompt: str, expected_output: 
str) -> str: _TEARDOWN_SERVICE.format(name=name), timeout=40 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @@ -240,14 +236,14 @@ def test_skyserve_spot_recovery(): name = _get_service_name() zone = 'us-central1-a' - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-spot-recovery-gcp', [ f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - terminate_gcp_replica(name, zone, 1), + smoke_tests_utils.terminate_gcp_replica(name, zone, 1), _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', @@ -255,7 +251,7 @@ def test_skyserve_spot_recovery(): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support spot instances @@ -263,7 +259,7 @@ def test_skyserve_spot_recovery(): @pytest.mark.no_kubernetes def test_skyserve_base_ondemand_fallback(generic_cloud: str): name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-base-ondemand-fallback', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', @@ -274,7 +270,7 @@ def test_skyserve_base_ondemand_fallback(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @@ -283,7 +279,7 @@ def test_skyserve_dynamic_ondemand_fallback(): name = _get_service_name() zone = 'us-central1-a' - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-dynamic-ondemand-fallback', [ f'sky serve up -n 
{name} --cloud gcp -y tests/skyserve/spot/dynamic_ondemand_fallback.yaml', @@ -302,7 +298,7 @@ def test_skyserve_dynamic_ondemand_fallback(): _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), _check_replica_in_status(name, [(2, True, 'READY'), (0, False, '')]), - terminate_gcp_replica(name, zone, 1), + smoke_tests_utils.terminate_gcp_replica(name, zone, 1), f'sleep 40', # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). f'{_SERVE_STATUS_WAIT.format(name=name)}; ' @@ -320,7 +316,7 @@ def test_skyserve_dynamic_ondemand_fallback(): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -330,7 +326,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): """Tests that we restart the service after user bug.""" # TODO(zhwu): this behavior needs some rethinking. name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-user-bug-restart', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', @@ -355,7 +351,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve @@ -363,7 +359,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): def test_skyserve_load_balancer(generic_cloud: str): """Test skyserve load balancer round-robin policy""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-load-balancer', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', @@ -378,7 +374,7 @@ def test_skyserve_load_balancer(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @@ -388,7 +384,7 @@ def 
test_skyserve_auto_restart(): """Test skyserve with auto restart""" name = _get_service_name() zone = 'us-central1-a' - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-auto-restart', [ # TODO(tian): we can dynamically generate YAML from template to @@ -400,7 +396,7 @@ def test_skyserve_auto_restart(): # sleep for 20 seconds (initial delay) to make sure it will # be restarted f'sleep 20', - terminate_gcp_replica(name, zone, 1), + smoke_tests_utils.terminate_gcp_replica(name, zone, 1), # Wait for consecutive failure timeout passed. # If the cluster is not using spot, it won't check the cluster status # on the cloud (since manual shutdown is not a common behavior and such @@ -421,7 +417,7 @@ def test_skyserve_auto_restart(): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve @@ -429,7 +425,7 @@ def test_skyserve_cancel(generic_cloud: str): """Test skyserve with cancel""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-cancel', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', @@ -446,14 +442,14 @@ def test_skyserve_cancel(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_streaming(generic_cloud: str): """Test skyserve with streaming""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-streaming', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', @@ -465,14 +461,14 @@ def test_skyserve_streaming(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_readiness_timeout_fail(generic_cloud: str): """Test skyserve with large readiness probe latency, expected to fail""" 
name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-readiness-timeout-fail', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', @@ -488,14 +484,14 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_large_readiness_timeout(generic_cloud: str): """Test skyserve with customized large readiness timeout""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-large-readiness-timeout', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', @@ -506,7 +502,7 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -515,7 +511,7 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): def test_skyserve_update(generic_cloud: str): """Test skyserve with update""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-update', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', @@ -536,7 +532,7 @@ def test_skyserve_update(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -548,7 +544,7 @@ def test_skyserve_rolling_update(generic_cloud: str): single_new_replica = _check_replica_in_status( name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), (1, False, 'SHUTTING_DOWN')]) - test = Test( + test = smoke_tests_utils.Test( 
f'test-skyserve-rolling-update', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', @@ -574,7 +570,7 @@ def test_skyserve_rolling_update(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack @@ -583,7 +579,7 @@ def test_skyserve_fast_update(generic_cloud: str): """Test skyserve with fast update (Increment version of old replicas)""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-fast-update', [ f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', @@ -616,14 +612,14 @@ def test_skyserve_fast_update(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_update_autoscale(generic_cloud: str): """Test skyserve update with autoscale""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-update-autoscale', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', @@ -652,7 +648,7 @@ def test_skyserve_update_autoscale(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Spot instances are note supported by Fluidstack @@ -690,7 +686,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): (2, False, 'READY')]) + _check_service_version(name, "1"), ] - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-new-autoscaler-update-{mode}', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', @@ -716,7 +712,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + 
smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -726,7 +722,7 @@ def test_skyserve_failures(generic_cloud: str): """Test replica failure statuses""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( 'test-skyserve-failures', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', @@ -764,7 +760,7 @@ def test_skyserve_failures(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO(Ziming, Tian): Add tests for autoscaling. @@ -772,8 +768,8 @@ def test_skyserve_failures(generic_cloud: str): # ------- Testing user dependencies -------- def test_user_dependencies(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'user-dependencies', [ f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', @@ -792,4 +788,4 @@ def test_user_dependencies(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) From 7db0579cc5dba21007b68404bc35da01e60358c6 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 5 Dec 2024 14:25:17 +0800 Subject: [PATCH 55/64] generate all cloud --- .buildkite/generate_pipeline.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index f2edae5dfca..d3070ab91f8 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -141,8 +141,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: return function_cloud_map -def _generate_pipeline(test_file: str, - one_cloud_per_test_function: bool) -> Dict[str, Any]: +def _generate_pipeline(test_file: str) -> Dict[str, Any]: """Generate a Buildkite pipeline from 
test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) @@ -160,8 +159,6 @@ def _generate_pipeline(test_file: str, 'if': f'build.env("{cloud}") == "1"' } steps.append(step) - if one_cloud_per_test_function: - break return {'steps': steps} @@ -191,8 +188,7 @@ def _convert_release(test_files: List[str]): output_file_pipelines_map = defaultdict(list) for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') - # We only need to run one cloud per test function. - pipeline = _generate_pipeline(test_file, True) + pipeline = _generate_pipeline(test_file) output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') # Enable all clouds by default for release pipeline. @@ -208,7 +204,7 @@ def _convert_pre_merge(test_files: List[str]): # We want enable all clouds by default for each test function # for pre-merge. And let the author controls which clouds # to run by parameter. - pipeline = _generate_pipeline(test_file, False) + pipeline = _generate_pipeline(test_file) pipeline['steps'].append({ 'label': 'Backward compatibility test', 'command': 'bash tests/backward_compatibility_tests.sh', From 4428c90115ac63de71e1661e1f92e0837be88d26 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 11:17:26 +0800 Subject: [PATCH 56/64] resolve PR comment --- .buildkite/generate_pipeline.py | 86 +++++++++++-------- ...ired_before_merge.py => test_pre_merge.py} | 0 2 files changed, 50 insertions(+), 36 deletions(-) rename tests/smoke_tests/{test_required_before_merge.py => test_pre_merge.py} (100%) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index d3070ab91f8..3446d7d683f 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -5,27 +5,23 @@ tests/smoke_tests ├── test_*.py -> release pipeline -├── test_required_before_merge.py -> pre-merge pipeline +├── test_pre_merge.py -> pre-merge pipeline run `python 
.buildkite/generate_pipeline.py` to generate the pipeline for testing. The CI will run this script as a pre-step, and use the generated pipeline to run the tests. -1. release pipeline, which runs all smoke tests by default, some function - support tests by multiple clouds, but we only generate one cloud per test - function to save cost. -2. pre-merge pipeline, which generates all clouds supported by the test - function, author should specify which clouds to run by setting env in the - step. +1. release pipeline, which runs all smoke tests by default, generates all + smoke tests for all clouds. +2. pre-merge pipeline, which generates all smoke tests for all clouds, + author should specify which clouds to run by setting env in the step. -We only have credentials for aws/azure/gcp/kubernetes(CLOUD_QUEUE_MAP) now, -smoke tests for those clouds are generated, other clouds are not supported -yet, smoke tests for those clouds are not generated. +We only have credentials for aws/azure/gcp/kubernetes(CLOUD_QUEUE_MAP and +SERVE_CLOUD_QUEUE_MAP) now, smoke tests for those clouds are generated, other +clouds are not supported yet, smoke tests for those clouds are not generated. """ import ast -from collections import defaultdict -import copy import os import random from typing import Any, Dict, List, Optional @@ -78,8 +74,19 @@ def _get_full_decorator_path(decorator: ast.AST) -> str: def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: - """Extract test functions and filter clouds with pytest.mark - from a Python test file.""" + """Extract test functions and filter clouds using pytest.mark + from a Python test file. + + We separate each test_function_{cloud} into different pipeline steps + to maximize the parallelism of the tests via the buildkite CI job queue. + This allows us to visualize the test results and rerun failures at the + granularity of each test_function_{cloud}. 
+ + If we make pytest --serve a job, it could contain dozens of test_functions + and run for hours. This makes it hard to visualize the test results and + rerun failures. Additionally, the parallelism would be controlled by pytest + instead of the buildkite job queue. + """ with open(file_path, 'r', encoding='utf-8') as file: tree = ast.parse(file.read(), filename=file_path) @@ -118,7 +125,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: continue clouds_to_include.append(suffix) clouds_to_include = (clouds_to_include if clouds_to_include else - copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) + DEFAULT_CLOUDS_TO_RUN) clouds_to_include = [ cloud for cloud in clouds_to_include if cloud not in clouds_to_exclude @@ -133,6 +140,14 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: f'but we do not have credentials for those clouds. ' f'Skipped.') continue + if clouds_to_include != final_clouds_to_include: + excluded_clouds = set(clouds_to_include) - set( + final_clouds_to_include) + print( + f'Warning: {file_path}:{node.name} ' + f'is marked to run on {clouds_to_include}, ' + f'but we only have credentials for {final_clouds_to_include}. 
' + f'clouds {excluded_clouds} are skipped.') function_name = (f'{class_name}::{node.name}' if class_name else node.name) function_cloud_map[function_name] = (final_clouds_to_include, [ @@ -162,43 +177,41 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: return {'steps': steps} -def _dump_pipeline_to_file(output_file_pipelines_map: Dict[str, - List[Dict[str, - Any]]], +def _dump_pipeline_to_file(yaml_file_path: str, + pipelines: List[Dict[str, Any]], extra_env: Optional[Dict[str, str]] = None): default_env = {'LOG_TO_STDOUT': '1', 'PYTHONPATH': '${PYTHONPATH}:$(pwd)'} if extra_env: default_env.update(extra_env) - - for yaml_file_path, pipelines in output_file_pipelines_map.items(): - with open(yaml_file_path, 'w', encoding='utf-8') as file: - file.write(GENERATED_FILE_HEAD) - all_steps = [] - for pipeline in pipelines: - all_steps.extend(pipeline['steps']) - # Shuffle the steps to avoid flakyness, consecutive runs of the same - # kind of test may fail for requiring locks on the same resources. - random.shuffle(all_steps) - final_pipeline = {'steps': all_steps, 'env': default_env} - yaml.dump(final_pipeline, file, default_flow_style=False) + with open(yaml_file_path, 'w', encoding='utf-8') as file: + file.write(GENERATED_FILE_HEAD) + all_steps = [] + for pipeline in pipelines: + all_steps.extend(pipeline['steps']) + # Shuffle the steps to avoid flakyness, consecutive runs of the same + # kind of test may fail for requiring locks on the same resources. 
+ random.shuffle(all_steps) + final_pipeline = {'steps': all_steps, 'env': default_env} + yaml.dump(final_pipeline, file, default_flow_style=False) def _convert_release(test_files: List[str]): yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' - output_file_pipelines_map = defaultdict(list) + output_file_pipelines = [] for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') pipeline = _generate_pipeline(test_file) - output_file_pipelines_map[yaml_file_path].append(pipeline) + output_file_pipelines.append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') # Enable all clouds by default for release pipeline. - _dump_pipeline_to_file(output_file_pipelines_map, + _dump_pipeline_to_file(yaml_file_path, + output_file_pipelines, extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP}) def _convert_pre_merge(test_files: List[str]): yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' - output_file_pipelines_map = defaultdict(list) + output_file_pipelines = [] for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') # We want enable all clouds by default for each test function @@ -213,9 +226,10 @@ def _convert_pre_merge(test_files: List[str]): }, 'if': 'build.env("aws") == "1"' }) - output_file_pipelines_map[yaml_file_path].append(pipeline) + output_file_pipelines.append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') - _dump_pipeline_to_file(output_file_pipelines_map, + _dump_pipeline_to_file(yaml_file_path, + output_file_pipelines, extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'}) diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_pre_merge.py similarity index 100% rename from tests/smoke_tests/test_required_before_merge.py rename to tests/smoke_tests/test_pre_merge.py From ce550e70087fb7d310685ddab2d77a43f56fe122 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 12:26:20 +0800 Subject: [PATCH 57/64] update comment 
--- tests/test_smoke.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 50824da7ec1..2d0f7605bc4 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,9 +25,8 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws """ -# This is the content that will be used in the future. # Currently copy back the tests/smoke_tests/* to tests/test_smoke.py for review. -# After review, we will remove the copy back part and use content below. +# After review, we will remove all contents in this file and use content below. # All files categorized under tests/smoke_tests/* # Please add new test cases under that directory. From e389780c4a355d25e510dd65d1fa7ccdd8426f53 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 12:39:21 +0800 Subject: [PATCH 58/64] naming fix --- .buildkite/generate_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 3446d7d683f..c2570ec465e 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -241,7 +241,7 @@ def main(): if not test_file.startswith('test_'): continue test_file_path = os.path.join('tests/smoke_tests', test_file) - if "required_before_merge" in test_file: + if "test_pre_merge" in test_file: pre_merge_files.append(test_file_path) else: release_files.append(test_file_path) From 74b2d6e2cb66d35b1934f0a86c44d298198058f0 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 12:42:53 +0800 Subject: [PATCH 59/64] grammar correction --- sky/sky_logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/sky_logging.py b/sky/sky_logging.py index 944cbcf46d4..effeab310d8 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -77,7 +77,7 @@ def _setup_logger(): # being propagated to the parent logger. 
_root_logger.propagate = False if env_options.Options.SUPPRESS_SENSITIVE_LOG.get(): - # If the sensitive log is enabled, we re init a new handler + # If the sensitive log is enabled, we reinitialize a new handler # and force set the level to INFO to suppress the debug logs # for certain loggers. for logger_name in _SENSITIVE_LOGGER: From 595c0431261b08e9fd077584f6123ffb3f5baab0 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 11 Dec 2024 18:17:59 +0800 Subject: [PATCH 60/64] resolve PR comment --- .buildkite/generate_pipeline.py | 16 +- tests/backward_compatibility_tests.sh | 2 + tests/smoke_tests/test_pre_merge.py | 2 +- tests/test_smoke.py | 5770 ----------------- ...merge.yaml => minimal_test_pre_merge.yaml} | 2 +- 5 files changed, 14 insertions(+), 5778 deletions(-) rename tests/test_yamls/{minimal_test_required_before_merge.yaml => minimal_test_pre_merge.yaml} (60%) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index c2570ec465e..8f1389d409a 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,17 +24,21 @@ import ast import os import random +import sys from typing import Any, Dict, List, Optional import yaml -DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] +# Add project root to Python path +tests_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'tests') +sys.path.append(tests_path) + +from conftest import all_clouds_in_smoke_tests +from conftest import default_clouds_to_run + +DEFAULT_CLOUDS_TO_RUN = default_clouds_to_run +ALL_CLOUDS_IN_SMOKE_TESTS = all_clouds_in_smoke_tests -ALL_CLOUDS_IN_SMOKE_TESTS = [ - 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', - 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod', - 'lambda_cloud' -] QUEUE_GENERIC_CLOUD = 'generic_cloud' QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve' QUEUE_KUBERNETES = 'kubernetes' diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 
511b2c9ba6b..d32e1e9e224 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -36,6 +36,7 @@ cd ../sky-master git pull origin master pip uninstall -y skypilot pip install uv +uv pip install --prerelease=allow "azure-cli>=2.65.0" uv pip install -e ".[all]" cd - @@ -45,6 +46,7 @@ conda install -c conda-forge google-cloud-sdk -y rm -r ~/.sky/wheels || true pip uninstall -y skypilot pip install uv +uv pip install --prerelease=allow "azure-cli>=2.65.0" uv pip install -e ".[all]" diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_pre_merge.py index 1d68b8a81e6..a2da638b8de 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_pre_merge.py @@ -29,7 +29,7 @@ def test_yaml_launch_and_mount(generic_cloud: str): test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', [ - f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', + f'sky launch -y -c {name} tests/test_yamls/minimal_test_pre_merge.yaml', smoke_tests_utils. get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 2d0f7605bc4..b33c1d80bce 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -24,12 +24,6 @@ # # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -""" -# Currently copy back the tests/smoke_tests/* to tests/test_smoke.py for review. -# After review, we will remove all contents in this file and use content below. - -# All files categorized under tests/smoke_tests/* -# Please add new test cases under that directory. 
from smoke_tests.test_basic import * from smoke_tests.test_cluster_job import * @@ -38,5767 +32,3 @@ from smoke_tests.test_mount_and_storage import * from smoke_tests.test_region_and_zone import * from smoke_tests.test_sky_serve import * -""" -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import tempfile -import textwrap -import time -from typing import Dict, List, Optional, TextIO, Tuple -import urllib.parse -import uuid - -import jinja2 -import pytest -from smoke_tests import smoke_tests_utils - -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.skylet import constants -from sky.skylet import events -from sky.utils import common_utils -from sky.utils import resources_utils - - -# ---------- Dry run: 2 Tasks in a chain. ---------- -@pytest.mark.no_fluidstack #requires GCP and AWS set up -def test_example_app(): - test = smoke_tests_utils.Test( - 'example_app', - ['python examples/example_app.py'], - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- A minimal task ---------- -def test_minimal(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'minimal', - [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - # Output validation done. - f'sky logs {name} 1 --status', - f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. 
- # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check the logs downloading - f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log', - # Ensure the raylet process has the correct file descriptor limit. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # Install jq for the next test. - f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', - # Check the cluster info - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', - f'sky logs {name} 5 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', - f'sky logs {name} 6 --status', # Ensure the job succeeded. 
- # Test '-c' for exec - f'sky exec -c {name} echo', - f'sky logs {name} 7 --status', - f'sky exec echo -c {name}', - f'sky logs {name} 8 --status', - f'sky exec -c {name} echo hi test', - f'sky logs {name} 9 | grep "hi test"', - f'sky exec {name} && exit 1 || true', - f'sky exec -c {name} && exit 1 || true', - ], - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test fast launch ---------- -def test_launch_fast(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - - test = smoke_tests_utils.Test( - 'test_launch_fast', - [ - # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - - # Second launch to test fast launch - should not reprovision - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast tests/test_yamls/minimal.yaml) && ' - ' echo "$s" && ' - # Validate that cluster was not re-launched. - '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' - # Validate that setup was not re-run. - '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' - # Validate that the task ran and finished. - 'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# See cloud exclusion explanations in test_autostop -@pytest.mark.no_fluidstack -@pytest.mark.no_lambda_cloud -@pytest.mark.no_ibm -@pytest.mark.no_kubernetes -def test_launch_fast_with_autostop(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. 
- autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = smoke_tests_utils.Test( - 'test_launch_fast_with_autostop', - [ - # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - f'sky status -r {name} | grep UP', - - # Ensure cluster is stopped - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout), - # Even the cluster is stopped, cloud platform may take a while to - # delete the VM. - f'sleep {smoke_tests_utils.BUMP_UP_SECONDS}', - # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=smoke_tests_utils.get_timeout(generic_cloud) + autostop_timeout, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test region ---------- -@pytest.mark.aws -def test_aws_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_region', - [ - f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', - f'sky exec {name} examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. 
- f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_with_ssh_proxy_command(): - name = smoke_tests_utils.get_cluster_name() - - with tempfile.NamedTemporaryFile(mode='w') as f: - f.write( - textwrap.dedent(f"""\ - aws: - ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} - """)) - f.flush() - test = smoke_tests_utils.Test( - 'aws_with_ssh_proxy_command', - [ - f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', - # Use jump config - f'export SKYPILOT_CONFIG={f.name}; ' - f'sky launch -y -c {name} --cloud aws --cpus 2 --region us-east-1 echo hi', - f'sky logs {name} 1 --status', - f'export SKYPILOT_CONFIG={f.name}; sky exec {name} echo hi', - f'sky logs {name} 2 --status', - # Start a small job to make sure the controller is created. - f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', - # Wait other tests to create the job controller first, so that - # the job controller is not launched with proxy command. - smoke_tests_utils. - get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard='sky-jobs-controller-*', - cluster_status=[sky.ClusterStatus.UP], - timeout=300), - f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.SUCCEEDED, - sky.ManagedJobStatus.RUNNING, - sky.ManagedJobStatus.STARTING - ], - timeout=300), - ], - f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_region_and_service_account(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_region', - [ - f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_region(): - name = smoke_tests_utils.get_cluster_name() - region = 'eu-de' - test = smoke_tests_utils.Test( - 'region', - [ - f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', - f'sky exec {name} --cloud ibm examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep {region}', # Ensure the region is correct. 
- ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure_region', - [ - f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test zone ---------- -@pytest.mark.aws -def test_aws_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_zone', - [ - f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', - f'sky exec {name} examples/minimal.yaml --zone us-east-2b', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2b', # Ensure the zone is correct. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_zone(): - name = smoke_tests_utils.get_cluster_name() - zone = 'eu-de-2' - test = smoke_tests_utils.Test( - 'zone', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky exec {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky status --all | grep {name} | grep {zone}', # Ensure the zone is correct. - ], - f'sky down -y {name} {name}-2 {name}-3', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_zone', - [ - f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1-a', # Ensure the zone is correct. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test the image ---------- -@pytest.mark.aws -def test_aws_images(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} examples/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_images(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky launch -c {name} --image-id skypilot:cpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_images(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:v1-ubuntu-2004 --cloud azure tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_image_id_dict', - [ - # Use image id dict. 
- f'sky launch -y -c {name} examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_image_id_dict', - [ - # Use image id dict. - f'sky launch -y -c {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_image_id_dict_region', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east-1 examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - # Should success because the image id match for the region. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - # Ensure exec works. 
- f'sky exec {name} --region us-east-2 examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_image_id_dict_region', - [ - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east1 tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the region. - f'sky launch -c {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-west3', # Ensure the region is correct. - # Ensure exec works. 
- f'sky exec {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-west3 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_image_id_dict_zone', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east-1b examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-east-2a examples/per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - # Fail due to image id mismatch. - f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2a', # Ensure the zone is correct. - # Ensure exec works. 
- f'sky exec {name} --zone us-east-2a examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_image_id_dict_zone', - [ - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east1-a tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - # Fail due to image id mismatch. - f'sky exec {name} --cloud gcp --image-id skypilot:gpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-central1', # Ensure the zone is correct. - # Ensure exec works. 
- f'sky exec {name} --cloud gcp --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-central1 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_clone_disk_aws(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'clone_disk_aws', - [ - f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=60), - # Wait for EC2 instance to be in stopped state. - # TODO: event based wait. 
- 'sleep 60', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_clone_disk_gcp(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'clone_disk_gcp', - [ - f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud gcp --zone us-central1-a "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud gcp --zone us-east1-b "cat ~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_mig(): - name = smoke_tests_utils.get_cluster_name() - region = 'us-central1' - test = smoke_tests_utils.Test( - 'gcp_mig', - [ - f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check MIG exists. 
- f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', - f'sky autostop -i 0 --down -y {name}', - smoke_tests_utils.get_cmd_wait_until_cluster_is_not_found( - cluster_name=name, timeout=120), - f'gcloud compute instance-templates list | grep "sky-it-{name}"', - # Launch again with the same region. The original instance template - # should be removed. - f'sky launch -y -c {name} --gpus L4 --num-nodes 2 --region {region} nvidia-smi', - f'sky logs {name} 1 | grep "L4"', - f'sky down -y {name}', - f'gcloud compute instance-templates list | grep "sky-it-{name}" && exit 1 || true', - ], - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_force_enable_external_ips(): - name = smoke_tests_utils.get_cluster_name() - test_commands = [ - f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', - # Check network of vm is "default" - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.network)" | grep "networks/default"'), - # Check External NAT in network access configs, corresponds to external ip - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.accessConfigs[0].name)" | grep "External NAT"' - ), - f'sky down -y {name}', - ] - skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' - test = smoke_tests_utils.Test('gcp_force_enable_external_ips', - test_commands, - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': skypilot_config}) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_image_no_conda(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'image_no_conda', - [ - # Use image id dict. 
- f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - f'sky logs {name} 1 --status', - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} examples/per_region_images.yaml', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_custom_default_conda_env(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test('custom_default_conda_env', [ - f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky status -r {name} | grep "UP"', - f'sky logs {name} 1 --status', - f'sky logs {name} 1 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 2 --status', - f'sky autostop -y -i 0 {name}', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=80), - f'sky start -y {name}', - f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 3 --status', - ], f'sky down -y {name}') - smoke_tests_utils.run_one_test(test) - - -# ------------ Test stale job ------------ -# ------------ Test stale job ------------ -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_stale_job(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'stale_job', - [ - f'sky launch -y -c {name} --cloud 
{generic_cloud} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - f'sky stop {name} -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=100), - f'sky start {name} -y', - f'sky logs {name} 1 --status', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED_DRIVER', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_stale_job_manual_restart(): - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = smoke_tests_utils.Test( - 'aws_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text`; ' - f'aws ec2 stop-instances --region {region} ' - '--instance-ids $id', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=40), - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. - smoke_tests_utils. 
- get_cmd_wait_until_job_status_contains_without_matching_job( - cluster_name=name, - job_status=[sky.JobStatus.FAILED_DRIVER], - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_stale_job_manual_restart(): - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - zone = 'us-west2-a' - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name={name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - stop_cmd = (f'gcloud compute instances stop --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'gcp_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - stop_cmd, - 'sleep 40', - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. - smoke_tests_utils. - get_cmd_wait_until_job_status_contains_without_matching_job( - cluster_name=name, - job_status=[sky.JobStatus.FAILED_DRIVER], - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Check Sky's environment variables; workdir. ---------- -@pytest.mark.no_fluidstack # Requires amazon S3 -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_env_check(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = smoke_tests_utils.Test( - 'env_check', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- file_mounts ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. -def test_file_mounts(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - extra_flags = '' - if generic_cloud in 'kubernetes': - # Kubernetes does not support multi-node - # NOTE: This test will fail if you have a Kubernetes cluster running on - # arm64 (e.g., Apple Silicon) since goofys does not work on arm64. - extra_flags = '--num-nodes 1' - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ] - test = smoke_tests_utils.Test( - 'using_file_mounts', - test_commands, - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud, 20 * 60), # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_file_mounts(): - name = smoke_tests_utils.get_cluster_name() - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ] - test = smoke_tests_utils.Test( - 'SCP_using_file_mounts', - test_commands, - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires GCP to be enabled -def test_using_file_mounts_with_env_vars(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - storage_name = TestStorageWithCredentials.generate_bucket_name() - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name}'), - f'sky logs {name} 1 --status', # Ensure the job succeeded. - # Override with --env: - (f'sky launch -y -c {name}-2 --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name} ' - '--env MY_LOCAL_PATH=tmpfile'), - f'sky logs {name}-2 1 --status', # Ensure the job succeeded. - ] - test = smoke_tests_utils.Test( - 'using_file_mounts_with_env_vars', - test_commands, - (f'sky down -y {name} {name}-2', - f'sky storage delete -y {storage_name} {storage_name}-2'), - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- storage ---------- -def _storage_mounts_commands_generator(f: TextIO, cluster_name: str, - storage_name: str, ls_hello_command: str, - cloud: str, only_mount: bool): - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render( - storage_name=storage_name, - cloud=cloud, - only_mount=only_mount, - ) - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {cluster_name} --cloud {cloud} {file_path}', - f'sky logs {cluster_name} 1 --status', # Ensure job succeeded. 
- ls_hello_command, - f'sky stop -y {cluster_name}', - f'sky start -y {cluster_name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {cluster_name} -- "set -ex; ls /mount_private_mount/hello.txt"', - ] - clean_command = f'sky down -y {cluster_name}; sky storage delete -y {storage_name}' - return test_commands, clean_command - - -@pytest.mark.aws -def test_aws_storage_mounts_with_stop(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'aws' - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = f'aws s3 ls {storage_name}/hello.txt' - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, cloud, False) - test = smoke_tests_utils.Test( - 'aws_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_storage_mounts_with_stop_only_mount(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'aws' - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = f'aws s3 ls {storage_name}/hello.txt' - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, cloud, True) - test = smoke_tests_utils.Test( - 'aws_storage_mounts_only_mount', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_storage_mounts_with_stop(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'gcp' - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = f'gsutil ls gs://{storage_name}/hello.txt' - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, 
ls_hello_command, cloud, False) - test = smoke_tests_utils.Test( - 'gcp_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_storage_mounts_with_stop(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'azure' - storage_name = f'sky-test-{int(time.time())}' - default_region = 'eastus' - storage_account_name = (storage_lib.AzureBlobStore. - get_default_storage_account_name(default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - # if the file does not exist, az storage blob list returns '[]' - ls_hello_command = (f'output=$(az storage blob list -c {storage_name} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} ' - f'--prefix hello.txt) ' - f'[ "$output" = "[]" ] && exit 1 || exit 0') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, cloud, False) - test = smoke_tests_utils.Test( - 'azure_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_storage_mounts(): - # Tests bucket mounting on k8s, assuming S3 is configured. - # This test will fail if run on non x86_64 architecture, since goofys is - # built for x86_64 only. 
- name = smoke_tests_utils.get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = (f'aws s3 ls {storage_name}/hello.txt || ' - f'gsutil ls gs://{storage_name}/hello.txt') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, 'kubernetes', False) - test = smoke_tests_utils.Test( - 'kubernetes_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_switch(): - name = smoke_tests_utils.get_cluster_name() - new_context = f'sky-test-context-{int(time.time())}' - new_namespace = f'sky-test-namespace-{int(time.time())}' - - test_commands = [ - # Launch a cluster and run a simple task - f'sky launch -y -c {name} --cloud kubernetes "echo Hello from original context"', - f'sky logs {name} 1 --status', # Ensure job succeeded - - # Get current context details and save to a file for later use in cleanup - 'CURRENT_CONTEXT=$(kubectl config current-context); ' - 'echo "$CURRENT_CONTEXT" > /tmp/sky_test_current_context; ' - 'CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.cluster}"); ' - 'CURRENT_USER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.user}"); ' - - # Create a new context with a different name and namespace - f'kubectl config set-context {new_context} --cluster="$CURRENT_CLUSTER" --user="$CURRENT_USER" --namespace={new_namespace}', - - # Create the new namespace if it doesn't exist - f'kubectl create namespace {new_namespace} --dry-run=client -o yaml | kubectl apply -f -', - - # Set the new context as active - f'kubectl config use-context {new_context}', - - # Verify the new context is active - f'[ "$(kubectl config current-context)" = "{new_context}" ] || exit 1', - - # Try to run sky exec on 
the original cluster (should still work) - f'sky exec {name} "echo Success: sky exec works after context switch"', - - # Test sky queue - f'sky queue {name}', - - # Test SSH access - f'ssh {name} whoami', - ] - - cleanup_commands = ( - f'kubectl delete namespace {new_namespace}; ' - f'kubectl config delete-context {new_context}; ' - 'kubectl config use-context $(cat /tmp/sky_test_current_context); ' - 'rm /tmp/sky_test_current_context; ' - f'sky down -y {name}') - - test = smoke_tests_utils.Test( - 'kubernetes_context_switch', - test_commands, - cleanup_commands, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test image with python 3.11 installed by default. - 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - ]) -def test_docker_storage_mounts(generic_cloud: str, image_id: str): - # Tests bucket mounting on docker container - name = smoke_tests_utils.get_cluster_name() - timestamp = str(time.time()).replace('.', '') - storage_name = f'sky-test-{timestamp}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - # ubuntu 18.04 does not support fuse3, and blobfuse2 depends on fuse3. - azure_mount_unsupported_ubuntu_version = '18.04' - # Commands to verify bucket upload. We need to check all three - # storage types because the optimizer may pick any of them. 
- s3_command = f'aws s3 ls {storage_name}/hello.txt' - gsutil_command = f'gsutil ls gs://{storage_name}/hello.txt' - azure_blob_command = TestStorageWithCredentials.cli_ls_cmd( - storage_lib.StoreType.AZURE, storage_name, suffix='hello.txt') - if azure_mount_unsupported_ubuntu_version in image_id: - # The store for mount_private_mount is not specified in the template. - # If we're running on Azure, the private mount will be created on - # azure blob. That will not be supported on the ubuntu 18.04 image - # and thus fail. For other clouds, the private mount on other - # storage types (GCS/S3) should succeed. - include_private_mount = False if generic_cloud == 'azure' else True - content = template.render(storage_name=storage_name, - include_azure_mount=False, - include_private_mount=include_private_mount) - else: - content = template.render(storage_name=storage_name,) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - # Check AWS, GCP, or Azure storage mount. 
- f'{s3_command} || ' - f'{gsutil_command} || ' - f'{azure_blob_command}', - ] - test = smoke_tests_utils.Test( - 'docker_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.cloudflare -def test_cloudflare_storage_mounts(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_r2_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - endpoint_url = cloudflare.create_endpoint() - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2' - ] - - test = smoke_tests_utils.Test( - 'cloudflare_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_storage_mounts(): - name = smoke_tests_utils.get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - storage_name, Rclone.RcloneClouds.IBM) - template_str = pathlib.Path( - 'tests/test_yamls/test_ibm_cos_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud ibm {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt', - ] - test = smoke_tests_utils.Test( - 'ibm_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- CLI logs ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. 
-def test_cli_logs(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - num_nodes = 2 - if generic_cloud == 'kubernetes': - # Kubernetes does not support multi-node - num_nodes = 1 - timestamp = time.time() - test = smoke_tests_utils.Test('cli_logs', [ - f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], f'sky down -y {name}') - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_logs(): - name = smoke_tests_utils.get_cluster_name() - timestamp = time.time() - test = smoke_tests_utils.Test( - 'SCP_cli_logs', - [ - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Job Queue. ---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue instead -@pytest.mark.no_scp # SCP does not have T4 gpus. Run test_scp_job_queue instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. 
-@pytest.mark.no_oci # OCI does not have T4 gpus -def test_job_queue(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'job_queue', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 -d examples/job_queue/job.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Job Queue with Docker. ---------- -@pytest.mark.no_fluidstack # FluidStack does not support docker for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_paperspace # Paperspace doesn't have T4 GPUs -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. 
- 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - # Axolotl image is a good example custom image that has its conda path - # set in PATH with dockerfile and uses python>=3.12. It could test: - # 1. we handle the env var set in dockerfile correctly - # 2. python>=3.12 works with SkyPilot runtime. - 'docker:winglian/axolotl:main-latest' - ]) -def test_job_queue_with_docker(generic_cloud: str, image_id: str): - name = smoke_tests_utils.get_cluster_name() + image_id[len('docker:'):][:4] - total_timeout_minutes = 40 if generic_cloud == 'azure' else 15 - time_to_sleep = 300 if generic_cloud == 'azure' else 180 - test = smoke_tests_utils.Test( - 'job_queue_with_docker', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', - f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - # Make sure the GPU is still visible to the container. 
- f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 4 --status', - f'sky stop -y {name}', - # Make sure the job status preserve after stop and start the - # cluster. This is also a test for the docker container to be - # preserved after stop and start. - f'sky start -y {name}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - # Make sure it is still visible after an stop & start cycle. - f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 7 --status' - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_job_queue(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'lambda_job_queue', - [ - f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - 
-@pytest.mark.ibm -def test_ibm_job_queue(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'ibm_job_queue', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100', - f'sky exec {name} -n {name}-1 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-2 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-3 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_job_queue(): - name = smoke_tests_utils.get_cluster_name() - num_of_gpu_launch = 1 - num_of_gpu_exec = 0.5 - test = smoke_tests_utils.Test( - 'SCP_job_queue', - [ - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 
gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue_multinode instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus. -@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet -def test_job_queue_multinode(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - total_timeout_minutes = 30 if generic_cloud == 'azure' else 15 - test = smoke_tests_utils.Test( - 'job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml', - f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - 'sleep 90', - f'sky cancel -y {name} 1', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- 'sleep 5', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_large_job_queue(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', - f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', - 'sleep 90', - - # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. - # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. 
- f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', - # Make sure the jobs are scheduled in FIFO order - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(1, 17) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in range(17, 33) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep PENDING' - for i in range(33, 75) - ], - f'sky cancel -y {name} 33 35 37 39 17 18 19', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(33, 40, 2) - ], - 'sleep 10', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in [34, 36, 38] - ], - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_fast_large_job_queue(generic_cloud: str): - # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. 
- name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'fast_large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 32`; do sky exec {name} -n {name}-$i -d "echo $i"; done', - 'sleep 60', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep SUCCEEDED | wc -l | grep 32', - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_job_queue_multinode(): - name = smoke_tests_utils.get_cluster_name() - task_file = 'examples/job_queue/job_multinode_ibm.yaml' - test = smoke_tests_utils.Test( - 'ibm_job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2', - f'sky exec {name} -n {name}-1 -d {task_file}', - f'sky exec {name} -n {name}-2 -d {task_file}', - f'sky launch -y -c {name} -n {name}-3 --detach-setup -d {task_file}', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep SETTING_UP)', - 'sleep 90', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - f'sky cancel -y {name} 1', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d {task_file}', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus v100:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Docker with preinstalled package. ---------- -@pytest.mark.no_fluidstack # Doesn't support Fluidstack for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -# TODO(zhwu): we should fix this for kubernetes -def test_docker_preinstalled_package(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'docker_with_preinstalled_package', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx', - f'sky exec {name} "nginx -V"', - f'sky logs {name} 1 --status', - f'sky exec {name} whoami | grep root', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Submitting multiple tasks to the same cluster. 
---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus -def test_multi_echo(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multi_echo', - [ - f'python examples/multi_echo.py {name} {generic_cloud}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 10', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - # Make sure that our job scheduler is fast enough to have at least - # 10 RUNNING jobs in parallel. - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "RUNNING" | wc -l | awk \'{{if ($1 < 10) exit 1}}\'', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done', - ] + - # Ensure jobs succeeded. - [ - smoke_tests_utils. - get_cmd_wait_until_job_status_contains_matching_job_id( - cluster_name=name, - job_id=i + 1, - job_status=[sky.JobStatus.SUCCEEDED], - timeout=120) for i in range(32) - ] + - # Ensure monitor/autoscaler didn't crash on the 'assert not - # unfulfilled' error. If process not found, grep->ssh returns 1. - [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Task: 1 node training. 
---------- -@pytest.mark.no_fluidstack # Fluidstack does not have T4 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. -def test_huggingface(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_huggingface(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'lambda_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_huggingface(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - num_of_gpu_launch = 1 - test = smoke_tests_utils.Test( - 'SCP_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky exec {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Inferentia. ---------- -@pytest.mark.aws -def test_inferentia(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test_inferentia', - [ - f'sky launch -y -c {name} -t inf2.xlarge -- echo hi', - f'sky exec {name} --gpus Inferentia:1 echo hi', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_app', - [ - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take >20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU VM. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_vm_app', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', # Ensure the cluster is STOPPED. - # Use retry: guard against transient errors observed for - # just-stopped TPU VMs (#962). 
- f'sky start --retry-until-up -y {name}', - f'sky exec {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU VM Pod. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm_pod(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_pod', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU Pod Slice on GKE. ---------- -@pytest.mark.kubernetes -def test_tpu_pod_slice_gke(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_pod_slice_gke', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. - f'sky logs {name} 2 --status' - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Simple apps. 
---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_hostname(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = smoke_tests_utils.Test( - 'multi_hostname', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 1 | grep "My hostname:" | wc -l | grep 2', # Ensure there are 2 hosts. - f'sky exec {name} examples/multi_hostname.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=smoke_tests_utils.get_timeout(generic_cloud, - total_timeout_minutes * 60), - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_node_failure(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multi_node_failure', - [ - # TODO(zhwu): we use multi-thread to run the commands in setup - # commands in parallel, which makes it impossible to fail fast - # when one of the nodes fails. We should fix this in the future. - # The --detach-setup version can fail fast, as the setup is - # submitted to the remote machine, which does not use multi-thread. - # Refer to the comment in `subprocess_utils.run_in_parallel`. - # f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/failed_worker_setup.yaml && exit 1', # Ensure the job setup failed. - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup tests/test_yamls/failed_worker_setup.yaml', - f'sky logs {name} 1 --status | grep FAILED_SETUP', # Ensure the job setup failed. - f'sky exec {name} tests/test_yamls/failed_worker_run.yaml', - f'sky logs {name} 2 --status | grep FAILED', # Ensure the job failed. - f'sky logs {name} 2 | grep "My hostname:" | wc -l | grep 2', # Ensure there 2 of the hosts printed their hostname. 
- ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on GCP. ---------- -@pytest.mark.gcp -def test_gcp_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on AWS. ---------- -@pytest.mark.aws -def test_aws_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on Azure. ---------- -@pytest.mark.azure -def test_azure_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on Kubernetes. ---------- -@pytest.mark.kubernetes -def test_kubernetes_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'kubernetes_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 100); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 5; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on Paperspace. ---------- -@pytest.mark.paperspace -def test_paperspace_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'paperspace_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on RunPod. ---------- -@pytest.mark.runpod -def test_runpod_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'runpod_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Labels from task on AWS (instance_tags) ---------- -@pytest.mark.aws -def test_task_labels_aws(): - name = smoke_tests_utils.get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='aws', region='us-east-1') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'task_labels_aws', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with aws cli that the tags are set. - 'aws ec2 describe-instances ' - '--query "Reservations[*].Instances[*].InstanceId" ' - '--filters "Name=instance-state-name,Values=running" ' - f'--filters "Name=tag:skypilot-cluster-name,Values={name}*" ' - '--filters "Name=tag:inlinelabel1,Values=inlinevalue1" ' - '--filters "Name=tag:inlinelabel2,Values=inlinevalue2" ' - '--region us-east-1 --output text', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Labels from task on GCP (labels) ---------- -@pytest.mark.gcp -def test_task_labels_gcp(): - name = smoke_tests_utils.get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='gcp') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'task_labels_gcp', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with gcloud cli that the tags are set - f'gcloud compute instances list --filter="name~\'^{name}\' AND ' - 'labels.inlinelabel1=\'inlinevalue1\' AND ' - 'labels.inlinelabel2=\'inlinevalue2\'" ' - '--format="value(name)" | 
grep .', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Labels from task on Kubernetes (labels) ---------- -@pytest.mark.kubernetes -def test_task_labels_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='kubernetes') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'task_labels_kubernetes', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with kubectl that the labels are set. - 'kubectl get pods ' - '--selector inlinelabel1=inlinevalue1 ' - '--selector inlinelabel2=inlinevalue2 ' - '-o jsonpath=\'{.items[*].metadata.name}\' | ' - f'grep \'^{name}\'' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Pod Annotations on Kubernetes ---------- -@pytest.mark.kubernetes -def test_add_pod_annotations_for_autodown_with_launch(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'add_pod_annotations_for_autodown_with_launch', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - # Autodown is set. - f'sky launch -y -c {name} -i 10 --down --num-nodes 2 --cpus=1 --cloud kubernetes', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. 
- 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_add_and_remove_pod_annotations_with_autostop(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'add_and_remove_pod_annotations_with_autostop', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - f'sky launch -y -c {name} --num-nodes 2 --cpus=1 --cloud kubernetes', - # Set autodown on the cluster with 'autostop' command. - f'sky autostop -y {name} -i 20 --down', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. - 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - # Cancel the set autodown to remove the annotations from the pods. - f'sky autostop -y {name} --cancel', - # Describe the first pod and check if annotations are removed. - '! kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - '! kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check if annotations are removed. - '! kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - '! 
kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Container logs from task on Kubernetes ---------- -@pytest.mark.kubernetes -def test_container_logs_multinode_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - head_logs = ('kubectl get pods ' - f' | grep {name} | grep head | ' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - worker_logs = ('kubectl get pods ' - f' | grep {name} | grep worker |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = smoke_tests_utils.Test( - 'container_logs_multinode_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml} --num-nodes 2', - f'{head_logs} | wc -l | grep 9', - f'{worker_logs} | wc -l | grep 9', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_container_logs_two_jobs_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = smoke_tests_utils.Test( - 'test_container_logs_two_jobs_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 9', - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', 
- ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_container_logs_two_simultaneous_jobs_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = smoke_tests_utils.Test( - 'test_container_logs_two_simultaneous_jobs_kubernetes', - [ - f'sky launch -y -c {name}', - f'sky exec -c {name} -d {task_yaml}', - f'sky exec -c {name} -d {task_yaml}', - 'sleep 30', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Task: n=2 nodes with setups. ---------- -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.skip( - reason= - 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') -def test_distributed_tf(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'resnet_distributed_tf_app', - [ - # NOTE: running it twice will hang (sometimes?) - an app-level bug. - f'python examples/resnet_distributed_tf_app.py {name} {generic_cloud}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=25 * 60, # 25 mins (it takes around ~19 mins) - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing GCP start and stop instances ---------- -@pytest.mark.gcp -def test_gcp_start_stop(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp-start-stop', - [ - f'sky launch -y -c {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. - f'sky logs {name} 3 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=40), - f'sky start -y {name} -i 1', - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[ - sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT - ], - timeout=200), - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing Azure start and stop instances ---------- -@pytest.mark.azure -def test_azure_start_stop(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure-start-stop', - [ - f'sky launch -y -c {name} examples/azure_start_stop.yaml', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. 
- f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[ - sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT - ], - timeout=280) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing Autostopping ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_ibm # FIX(IBM) sporadically fails, as restarted workers stay uninitialized indefinitely -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_kubernetes # Kubernetes does not autostop yet -def test_autostop(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. - autostop_timeout = 600 if generic_cloud == 'azure' else 250 - # Launching and starting Azure clusters can take a long time too. e.g., restart - # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. - total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 - test = smoke_tests_utils.Test( - 'autostop', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} -i 1', - - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m"', - - # Ensure the cluster is not stopped early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - - # Ensure the cluster is STOPPED. 
- smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout), - - # Ensure the cluster is UP and the autostop setting is reset ('-'). - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - - # Ensure the job succeeded. - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - - # Test restarting the idleness timer via reset: - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 40', # Almost reached the threshold. - f'sky autostop -y {name} -i 1', # Should restart the timer. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout), - - # Test restarting the idleness timer via exec: - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 45', # Almost reached the threshold. - f'sky exec {name} echo hi', # Should restart the timer. - 'sleep 45', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout + smoke_tests_utils.BUMP_UP_SECONDS), - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing Autodowning ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. -def test_autodown(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure - # the VM is terminated. 
- autodown_timeout = 900 if generic_cloud == 'azure' else 240 - total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 - test = smoke_tests_utils.Test( - 'autodown', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - # Ensure the cluster is terminated. - f'sleep {autodown_timeout}', - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - f'sleep {autodown_timeout}', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - f'sleep {autodown_timeout}', - # Ensure the cluster is still UP. 
- f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_autodown(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'SCP_autodown', - [ - f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 45', - f'sky status --refresh | grep {name} | grep UP', - # Ensure the cluster is terminated. - 'sleep 200', - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - 'sleep 200', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - 'sleep 200', - # Ensure the cluster is still UP. 
- f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): - test = smoke_tests_utils.Test( - f'{cloud}-cancel-task', - [ - f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', - # Wait the GPU process to start. - 'sleep 60', - f'sky exec {name} "nvidia-smi | grep python"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky cancel -y {name} 1', - 'sleep 60', - # check if the python job is gone. - f'sky exec {name} "! nvidia-smi | grep python"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=timeout, - ) - return test - - -# ---------- Testing `sky cancel` ---------- -@pytest.mark.aws -def test_cancel_aws(): - name = smoke_tests_utils.get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'aws') - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_cancel_gcp(): - name = smoke_tests_utils.get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'gcp') - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_cancel_azure(): - name = smoke_tests_utils.get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_cancel_pytorch(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'cancel-pytorch', - [ - f'sky launch -c {name} --cloud {generic_cloud} 
examples/resnet_distributed_torch.yaml -y -d', - # Wait the GPU process to start. - 'sleep 90', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || ' - # When run inside container/k8s, nvidia-smi cannot show process ids. - # See https://github.com/NVIDIA/nvidia-docker/issues/179 - # To work around, we check if GPU utilization is greater than 0. - f'[ \$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits) -gt 0 ]"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky cancel -y {name} 1', - 'sleep 60', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep \'No running process\') || ' - # Ensure Xorg is the only process running. - '[ \$(nvidia-smi | grep -A 10 Processes | grep -A 10 === | grep -v Xorg) -eq 2 ]"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` -# requires a CUDA public image, which IBM doesn't offer -@pytest.mark.ibm -def test_cancel_ibm(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'ibm-cancel-task', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', - f'sky exec {name} -n {name}-1 -d "while true; do echo \'Hello SkyPilot\'; sleep 2; done"', - 'sleep 20', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky cancel -y {name} 2', - f'sleep 5', - f'sky queue {name} | grep {name}-1 | grep CANCELLED', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing use-spot option ---------- -@pytest.mark.no_fluidstack # FluidStack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances 
-@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -def test_use_spot(generic_cloud: str): - """Test use-spot and sky exec.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'use-spot', - [ - f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', - f'sky logs {name} 1 --status', - f'sky exec {name} echo hi', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_stop_gcp_spot(): - """Test GCP spot can be stopped, autostopped, restarted.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'stop_gcp_spot', - [ - f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', - # stop should go through: - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 2 --status', - f'sky autostop {name} -i0 -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=90), - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 3 --status', - # -i option at launch should go through: - f'sky launch -c {name} -i0 -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=120), - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing managed job ---------- -# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues -# when the controller being on Azure, which takes a long time for launching -# step. 
-@pytest.mark.managed_jobs -def test_managed_jobs(generic_cloud: str): - """Test the managed jobs yaml.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed-jobs', - [ - f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-1', - job_status=[ - sky.ManagedJobStatus.PENDING, - sky.ManagedJobStatus.SUBMITTED, - sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING - ], - timeout=60), - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[ - sky.ManagedJobStatus.PENDING, - sky.ManagedJobStatus.SUBMITTED, - sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING - ], - timeout=60), - f'sky jobs cancel -y -n {name}-1', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-1', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=230), - # Test the functionality for logging. - f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', - f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', - ], - # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when - # canceling multiple job names is supported. - f'sky jobs cancel -y -n {name}-1; sky jobs cancel -y -n {name}-2', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_job_pipeline(generic_cloud: str): - """Test a job pipeline.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'spot-pipeline', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', - 'sleep 5', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', - # `grep -A 4 {name}` finds the job with {name} and the 4 lines - # after it, i.e. the 4 tasks within the job. - # `sed -n 2p` gets the second line of the 4 lines, i.e. the first - # task within the job. 
- f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', - f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_failed_setup(generic_cloud: str): - """Test managed job with failed setup.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed_jobs_failed_setup', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - # Make sure the job failed quickly. 
- smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.FAILED_SETUP], - timeout=330 + smoke_tests_utils.BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): - """Test managed job with failed setup for a pipeline.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed_jobs_pipeline_failed_setup', - [ - f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.FAILED_SETUP], - timeout=600), - # Make sure the job failed quickly. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', - # Task 0 should be SUCCEEDED. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', - # Task 1 should be FAILED_SETUP. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', - # Task 2 should be CANCELLED. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - # Task 3 should be CANCELLED. 
- f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing managed job recovery ---------- - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_aws(aws_config_region): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_aws', - [ - f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=600), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_gcp(): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-east4-b' - query_cmd = ( - f'gcloud compute instances list --filter=' - # `:` means prefix match. - f'"(labels.ray-cluster-name:{name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - terminate_cmd, - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_aws(aws_config_region): - """Test managed job recovery for a pipeline.""" - name = smoke_tests_utils.get_cluster_name() - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - region = aws_config_region - if region != 'us-east-2': - pytest.skip('Only run spot pipeline recovery test in us-east-2') - test = smoke_tests_utils.Test( - 'managed_jobs_pipeline_recovery_aws', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', - # Terminate the cluster manually. - # The `cat ...| rev` is to retrieve the job_id from the - # SKYPILOT_TASK_ID, which gets the second to last field - # separated by `-`. - ( - f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - 'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;' - f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - # TODO(zhwu): fix the name for spot cluster. 
- '--filters Name=tag:ray-cluster-name,Values=*-${MANAGED_JOB_ID}' - f'-{user_hash} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', - f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', - f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_gcp(): - """Test managed job recovery for a pipeline.""" - name = smoke_tests_utils.get_cluster_name() - zone = 'us-east4-b' - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - query_cmd = ( - 'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:*-${{MANAGED_JOB_ID}}-{user_hash})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_pipeline_recovery_gcp', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name(
- job_name=name,
- job_status=[sky.ManagedJobStatus.RUNNING],
- timeout=400),
- f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id',
- f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids',
- # Terminate the cluster manually.
- # The `cat ...| rev` is to retrieve the job_id from the
- # SKYPILOT_TASK_ID, which gets the second to last field
- # separated by `-`.
- (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | '
- f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'),
- smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name),
- f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"',
- smoke_tests_utils.
- get_cmd_wait_until_managed_job_status_contains_matching_job_name(
- job_name=name,
- job_status=[sky.ManagedJobStatus.RUNNING],
- timeout=200),
- f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"',
- f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new',
- f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new',
- f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`',
- ],
- f'sky jobs cancel -y -n {name}',
- timeout=25 * 60,
- )
- smoke_tests_utils.run_one_test(test)
-
-
-@pytest.mark.no_fluidstack # Fluidstack does not support spot instances
-@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
-@pytest.mark.no_ibm # IBM Cloud does not support spot instances
-@pytest.mark.no_scp # SCP does not support spot instances
-@pytest.mark.no_paperspace # Paperspace does not support spot instances
-@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances
-@pytest.mark.managed_jobs
-def test_managed_jobs_recovery_default_resources(generic_cloud: str): - """Test managed job recovery for default resources.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed-spot-recovery-default-resources', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.RUNNING, - sky.ManagedJobStatus.RECOVERING - ], - timeout=360), - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_aws(aws_config_region): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_multi_node_aws', - [ - f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=450), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. 
- (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - 'Name=tag:ray-node-type,Values=worker ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_gcp(): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west2-a' - # Use ':' to match as the cluster name will contain the suffix with job id - query_cmd = ( - f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_on_cloud} AND ' - f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_multi_node_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. - terminate_cmd, - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_aws(aws_config_region): - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_2_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-2', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = smoke_tests_utils.Test( - 'managed_jobs_cancellation_aws', - [ - # Test cancellation during spot cluster being launched. - f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING - ], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - # The job is running in the cluster, will shown as RUNNING. - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - # Terminate the cluster manually. 
- (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "pending|running|stopped|stopping"' - ), - ], - timeout=25 * 60) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_gcp(): - name = smoke_tests_utils.get_cluster_name() - name_3 = f'{name}-3' - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west3-b' - query_state_cmd = ( - 'gcloud compute instances list ' - f'--filter="(labels.ray-cluster-name:{name_3_on_cloud})" ' - '--format="value(status)"') - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_3_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_cancellation_gcp', 
- [ - # Test cancellation during spot cluster being launched. - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.STARTING], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - # Terminate the cluster manually. - terminate_cmd, - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' - ), - ], - timeout=25 * 60) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing storage for managed job ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_storage(generic_cloud: str): - """Test storage with managed job""" - name = smoke_tests_utils.get_cluster_name() - yaml_str = pathlib.Path( - 'examples/managed_job_with_storage.yaml').read_text() - timestamp = int(time.time()) - storage_name = f'sky-test-{timestamp}' - output_storage_name = f'sky-test-output-{timestamp}' - - # Also perform region testing for bucket creation to validate if buckets are - # created in the correct region and correctly mounted in managed jobs. - # However, we inject this testing only for AWS and GCP since they are the - # supported object storage providers in SkyPilot. 
- region_flag = '' - region_validation_cmd = 'true' - use_spot = ' --use-spot' - if generic_cloud == 'aws': - region = 'eu-central-1' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.S3, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - output_check_cmd = f'{s3_check_file_count} | grep 1' - elif generic_cloud == 'gcp': - region = 'us-west2' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.GCS, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - output_check_cmd = f'{gcs_check_file_count} | grep 1' - elif generic_cloud == 'azure': - region = 'westus2' - region_flag = f' --region {region}' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name(region)) - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.AZURE, - storage_account_name=storage_account_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - az_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.AZURE, - output_storage_name, - 'output.txt', - storage_account_name=storage_account_name) - output_check_cmd = f'{az_check_file_count} | grep 1' - elif generic_cloud == 'kubernetes': - # With Kubernetes, we don't know which object storage provider is used. - # Check both S3 and GCS if bucket exists in either. 
- s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - s3_output_check_cmd = f'{s3_check_file_count} | grep 1' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - gcs_output_check_cmd = f'{gcs_check_file_count} | grep 1' - output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}' - use_spot = ' --no-use-spot' - - yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) - yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(yaml_str) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'managed_jobs_storage', - [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', - region_validation_cmd, # Check if the bucket is created in the correct region - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.SUCCEEDED], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - # Wait for the job to be cleaned up. - 'sleep 20', - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', - # Check if file was written to the mounted output bucket - output_check_cmd - ], - (f'sky jobs cancel -y -n {name}', - f'; sky storage delete {output_storage_name} || true'), - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing spot TPU ---------- -@pytest.mark.gcp -@pytest.mark.managed_jobs -@pytest.mark.tpu -def test_managed_jobs_tpu(): - """Test managed job on TPU.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-spot-tpu', - [ - f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.STARTING], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - # TPU takes a while to launch - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.RUNNING, sky.ManagedJobStatus.SUCCEEDED - ], - timeout=900 + smoke_tests_utils.BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env for managed jobs ---------- -@pytest.mark.managed_jobs -def test_managed_jobs_inline_env(generic_cloud: str): - """Test managed jobs env""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-managed-jobs-inline-env', - [ - f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "echo "\\$TEST_ENV"; ([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.SUCCEEDED], - timeout=20 + smoke_tests_utils.BUMP_UP_SECONDS), - f'JOB_ROW=$(sky jobs queue | grep {name} | head -n1) && ' - f'echo "$JOB_ROW" && echo "$JOB_ROW" | grep "SUCCEEDED" && ' - f'JOB_ID=$(echo "$JOB_ROW" | awk \'{{print $1}}\') && ' - f'echo "JOB_ID=$JOB_ID" && ' - # Test that logs are still available after the job finishes. - 'unset SKYPILOT_DEBUG; s=$(sky jobs logs $JOB_ID --refresh) && echo "$s" && echo "$s" | grep "hello world" && ' - # Make sure we skip the unnecessary logs. - 'echo "$s" | head -n1 | grep "Waiting for"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env ---------- -def test_inline_env(generic_cloud: str): - """Test env""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-inline-env', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'sky logs {name} 1 --status', - f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env file ---------- -def test_inline_env_file(generic_cloud: str): - """Test env""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-inline-env-file', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 1 --status', - f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing custom image ---------- -@pytest.mark.aws -def test_aws_custom_image(): - """Test AWS custom image""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-aws-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image - f'sky logs {name} 1 --status', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. 
- 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - ]) -def test_kubernetes_custom_image(image_id): - """Test Kubernetes custom image""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-kubernetes-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', - f'sky logs {name} 1 --status', - # Try exec to run again and check if the logs are printed - f'sky exec {name} tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1 | grep "Hello 100"', - # Make sure ssh is working with custom username - f'ssh {name} echo hi | grep hi', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_start_stop_two_nodes(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure-start-stop-two-nodes', - [ - f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[ - sky.ClusterStatus.INIT, sky.ClusterStatus.STOPPED - ], - timeout=200 + smoke_tests_utils.BUMP_UP_SECONDS) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins (it takes around ~23 mins) - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env for disk tier ---------- -@pytest.mark.aws -def test_aws_disk_tier(): - - def _get_aws_query_command(region, instance_id, field, expected): - return (f'aws ec2 describe-volumes --region {region} ' - f'--filters Name=attachment.instance-id,Values={instance_id} ' - f'--query Volumes[*].{field} | grep {expected} ; ') - - for disk_tier in list(resources_utils.DiskTier): - specs = AWS._get_disk_specs(disk_tier) - name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = smoke_tests_utils.Test( - 'aws-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud aws --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} --query ' - f'Reservations[].Instances[].InstanceId --output text`; ' + - _get_aws_query_command(region, '$id', 'VolumeType', - specs['disk_tier']) + - ('' if specs['disk_tier'] - == 'standard' else _get_aws_query_command( - region, '$id', 'Iops', specs['disk_iops'])) + - ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( - region, '$id', 'Throughput', specs['disk_throughput'])), - ], - f'sky down -y {name}', - timeout=10 * 60, # 10 mins (it takes around ~6 mins) - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - disk_types = 
[GCP._get_disk_type(disk_tier)] - name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - region = 'us-west2' - instance_type_options = [''] - if disk_tier == resources_utils.DiskTier.BEST: - # Ultra disk tier requires n2 instance types to have more than 64 CPUs. - # If using default instance type, it will only enable the high disk tier. - disk_types = [ - GCP._get_disk_type(resources_utils.DiskTier.HIGH), - GCP._get_disk_type(resources_utils.DiskTier.ULTRA), - ] - instance_type_options = ['', '--instance-type n2-standard-64'] - for disk_type, instance_type_option in zip(disk_types, - instance_type_options): - test = smoke_tests_utils.Test( - 'gcp-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud gcp --region {region} ' - f'--disk-tier {disk_tier.value} {instance_type_option} ', - f'name=`gcloud compute instances list --filter=' - f'"labels.ray-cluster-name:{name_on_cloud}" ' - '--format="value(name)"`; ' - f'gcloud compute disks list --filter="name=$name" ' - f'--format="value(type)" | grep {disk_type} ' - ], - f'sky down -y {name}', - timeout=6 * 60, # 6 mins (it takes around ~3 mins) - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: - # Azure does not support high and ultra disk tier. 
- continue - type = Azure._get_disk_type(disk_tier) - name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = smoke_tests_utils.Test( - 'azure-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}' - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_best_tier_failover(): - type = Azure._get_disk_type(resources_utils.DiskTier.LOW) - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = smoke_tests_utils.Test( - 'azure-best-tier-failover', - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier best --instance-type Standard_D8_v5 echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - smoke_tests_utils.run_one_test(test) - - -# ------ Testing Zero Quota Failover ------ -@pytest.mark.aws -def test_aws_zero_quota_failover(): - - name = smoke_tests_utils.get_cluster_name() - region = smoke_tests_utils.get_aws_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for EC2 P3 instances were found on all AWS regions. 
Is this ' - 'expected for your account?') - return - - test = smoke_tests_utils.Test( - 'aws-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zero_quota_failover(): - - name = smoke_tests_utils.get_cluster_name() - region = smoke_tests_utils.get_gcp_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for A100-80GB GPUs were found on all GCP regions. Is this ' - 'expected for your account?') - return - - test = smoke_tests_utils.Test( - 'gcp-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -def test_long_setup_run_script(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - with tempfile.NamedTemporaryFile('w', prefix='sky_app_', - suffix='.yaml') as f: - f.write( - textwrap.dedent(""" \ - setup: | - echo "start long setup" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end long setup"\n') - f.write( - textwrap.dedent(""" \ - run: | - echo "run" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end run"\n') - f.flush() - - test = smoke_tests_utils.Test( - 'long-setup-run-script', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', - f'sky exec {name} "echo hello"', - f'sky exec {name} {f.name}', - f'sky logs {name} --status 1', - f'sky logs {name} --status 2', - f'sky logs {name} --status 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing skyserve ---------- - - -def _get_service_name() -> str: - """Returns a user-unique service name for each test_skyserve_(). 
- - Must be called from each test_skyserve_(). - """ - caller_func_name = inspect.stack()[1][3] - test_name = caller_func_name.replace('_', '-').replace('test-', 't-') - test_name = test_name.replace('skyserve-', 'ss-') - test_name = common_utils.make_cluster_name_on_cloud(test_name, 24) - return f'{test_name}-{smoke_tests_utils.test_id}' - - -# We check the output of the skyserve service to see if it is ready. Output of -# `REPLICAS` is in the form of `1/2` where the first number is the number of -# ready replicas and the second number is the number of total replicas. We -# grep such format to ensure that the service is ready, and early exit if any -# failure detected. In the end we sleep for -# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have -# enough time to sync with the controller and get all ready replica IPs. -_SERVE_WAIT_UNTIL_READY = ( - '{{ while true; do' - ' s=$(sky serve status {name}); echo "$s";' - ' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;' - ' echo "$s" | grep -q "FAILED" && exit 1;' - ' sleep 10;' - ' done; }}; echo "Got service status $s";' - f'sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS + 2};') -_IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' -_AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' -_SERVICE_LAUNCHING_STATUS_REGEX = 'PROVISIONING\|STARTING' -# Since we don't allow terminate the service if the controller is INIT, -# which is common for simultaneous pytest, we need to wait until the -# controller is UP before we can terminate the service. -# The teardown command has a 10-mins timeout, so we don't need to do -# the timeout here. See implementation of run_one_test() for details. 
-_TEARDOWN_SERVICE = ( - '(for i in `seq 1 20`; do' - ' s=$(sky serve down -y {name});' - ' echo "Trying to terminate {name}";' - ' echo "$s";' - ' echo "$s" | grep -q "scheduled to be terminated\|No service to terminate" && break;' - ' sleep 10;' - ' [ $i -eq 20 ] && echo "Failed to terminate service {name}";' - 'done)') - -_SERVE_ENDPOINT_WAIT = ( - 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' - 'endpoint=$(sky serve status --endpoint {name}); ' - 'until ! echo "$endpoint" | grep "Controller is initializing"; ' - 'do echo "Waiting for serve endpoint to be ready..."; ' - 'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; ' - 'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"') - -_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); ' - 'until ! echo "$s" | grep "Controller is initializing."; ' - 'do echo "Waiting for serve status to be ready..."; ' - 'sleep 5; s=$(sky serve status {name}); done; echo "$s"') - - -def _get_replica_ip(name: str, replica_id: int) -> str: - return (f'ip{replica_id}=$(echo "$s" | ' - f'awk "{_AWK_ALL_LINES_BELOW_REPLICAS}" | ' - f'grep -E "{name}\s+{replica_id}" | ' - f'grep -Eo "{_IP_REGEX}")') - - -def _get_skyserve_http_test(name: str, cloud: str, - timeout_minutes: int) -> smoke_tests_utils.Test: - test = smoke_tests_utils.Test( - f'test-skyserve-{cloud.replace("_", "-")}', - [ - f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=timeout_minutes * 60, - ) - return test - - -def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool, - str]]) -> str: - """Check replicas' status and count in sky serve status - - We will check vCPU=2, as all our tests use vCPU=2. 
- - Args: - name: the name of the service - check_tuples: A list of replica property to check. Each tuple is - (count, is_spot, status) - """ - check_cmd = '' - for check_tuple in check_tuples: - count, is_spot, status = check_tuple - resource_str = '' - if status not in ['PENDING', 'SHUTTING_DOWN' - ] and not status.startswith('FAILED'): - spot_str = '' - if is_spot: - spot_str = '\[Spot\]' - resource_str = f'({spot_str}vCPU=2)' - check_cmd += (f' echo "$s" | grep "{resource_str}" | ' - f'grep "{status}" | wc -l | grep {count} || exit 1;') - return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd) - - -def _check_service_version(service_name: str, version: str) -> str: - # Grep the lines before 'Service Replicas' and check if the service version - # is correct. - return (f'echo "$s" | grep -B1000 "Service Replicas" | ' - f'grep -E "{service_name}\s+{version}" || exit 1; ') - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_gcp_http(): - """Test skyserve on GCP""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'gcp', 20) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.serve -def test_skyserve_aws_http(): - """Test skyserve on AWS""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'aws', 20) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -@pytest.mark.serve -def test_skyserve_azure_http(): - """Test skyserve on Azure""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'azure', 30) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.serve -def test_skyserve_kubernetes_http(): - """Test skyserve on Kubernetes""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'kubernetes', 30) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.oci -@pytest.mark.serve -def test_skyserve_oci_http(): - """Test skyserve on OCI""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'oci', 
20) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now -@pytest.mark.serve -def test_skyserve_llm(generic_cloud: str): - """Test skyserve with real LLM usecase""" - name = _get_service_name() - - def generate_llm_test_command(prompt: str, expected_output: str) -> str: - prompt = shlex.quote(prompt) - expected_output = shlex.quote(expected_output) - return ( - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python tests/skyserve/llm/get_response.py --endpoint $endpoint ' - f'--prompt {prompt} | grep {expected_output}') - - with open('tests/skyserve/llm/prompt_output.json', 'r', - encoding='utf-8') as f: - prompt2output = json.load(f) - - test = smoke_tests_utils.Test( - f'test-skyserve-llm', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - *[ - generate_llm_test_command(prompt, output) - for prompt, output in prompt2output.items() - ], - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=40 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_spot_recovery(): - name = _get_service_name() - zone = 'us-central1-a' - - test = smoke_tests_utils.Test( - f'test-skyserve-spot-recovery-gcp', - [ - f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - smoke_tests_utils.terminate_gcp_replica(name, zone, 1), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - 
smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_base_ondemand_fallback(generic_cloud: str): - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-base-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(1, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_dynamic_ondemand_fallback(): - name = _get_service_name() - zone = 'us-central1-a' - - test = smoke_tests_utils.Test( - f'test-skyserve-dynamic-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud gcp -y tests/skyserve/spot/dynamic_ondemand_fallback.yaml', - f'sleep 40', - # 2 on-demand (provisioning) + 2 Spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s";' - 'echo "$s" | grep -q "0/4" || exit 1', - # Wait for the provisioning starts - f'sleep 40', - _check_replica_in_status(name, [ - (2, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (2, False, _SERVICE_LAUNCHING_STATUS_REGEX + '\|SHUTTING_DOWN') - ]), - - # Wait until 2 spot instances are ready. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - smoke_tests_utils.terminate_gcp_replica(name, zone, 1), - f'sleep 40', - # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - 'echo "$s" | grep -q "1/3"', - _check_replica_in_status( - name, [(1, True, 'READY'), - (1, True, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - - # Wait until 2 spot instances are ready. 
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_user_bug_restart(generic_cloud: str): - """Tests that we restart the service after user bug.""" - # TODO(zhwu): this behavior needs some rethinking. - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-user-bug-restart', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "SHUTTING_DOWN"; ' - 'do echo "Waiting for first service to be SHUTTING DOWN..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; ', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "FAILED"; ' - 'do echo "Waiting for first service to be FAILED..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; echo "$s"; ' - + _check_replica_in_status(name, [(1, True, 'FAILED')]) + - # User bug failure will cause no further scaling. 
- f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 1; ' - f'echo "$s" | grep -B 100 "NO_REPLICA" | grep "0/0"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/auto_restart.yaml', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, SkyPilot here!"; do sleep 2; done; sleep 2; ' - + _check_replica_in_status(name, [(1, False, 'READY'), - (1, False, 'FAILED')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -@pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP -def test_skyserve_load_balancer(generic_cloud: str): - """Test skyserve load balancer round-robin policy""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-load-balancer', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - f'{_get_replica_ip(name, 1)}; ' - f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' - 'python tests/skyserve/load_balancer/test_round_robin.py ' - '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_auto_restart(): - """Test skyserve with auto restart""" - name = _get_service_name() - zone = 'us-central1-a' - test = smoke_tests_utils.Test( - f'test-skyserve-auto-restart', - [ - # TODO(tian): we can dynamically generate YAML from template to - # avoid maintaining too many YAML files - f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - 
f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - # sleep for 20 seconds (initial delay) to make sure it will - # be restarted - f'sleep 20', - smoke_tests_utils.terminate_gcp_replica(name, zone, 1), - # Wait for consecutive failure timeout passed. - # If the cluster is not using spot, it won't check the cluster status - # on the cloud (since manual shutdown is not a common behavior and such - # queries takes a lot of time). Instead, we think continuous 3 min probe - # failure is not a temporary problem but indeed a failure. - 'sleep 180', - # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time - # that the output of `sky serve status` shows FAILED and this status will - # cause _SERVE_WAIT_UNTIL_READY to early quit. - '(while true; do' - f' output=$(sky serve status {name});' - ' echo "$output" | grep -q "1/1" && break;' - ' sleep 10;' - f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_cancel(generic_cloud: str): - """Test skyserve with cancel""" - name = _get_service_name() - - test = smoke_tests_utils.Test( - f'test-skyserve-cancel', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' - 'tests/skyserve/cancel/send_cancel_request.py ' - '--endpoint $endpoint | grep "Request was cancelled"', - f's=$(sky serve logs {name} 1 --no-follow); ' - 'until ! 
echo "$s" | grep "Please wait for the controller to be"; ' - 'do echo "Waiting for serve logs"; sleep 10; ' - f's=$(sky serve logs {name} 1 --no-follow); done; ' - 'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_streaming(generic_cloud: str): - """Test skyserve with streaming""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-streaming', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python3 tests/skyserve/streaming/send_streaming_request.py ' - '--endpoint $endpoint | grep "Streaming test passed"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_readiness_timeout_fail(generic_cloud: str): - """Test skyserve with large readiness probe latency, expected to fail""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-readiness-timeout-fail', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', - # None of the readiness probe will pass, so the service will be - # terminated after the initial delay. 
- f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_large_readiness_timeout(generic_cloud: str): - """Test skyserve with customized large readiness timeout""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-large-readiness-timeout', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_update(generic_cloud: str): - """Test skyserve with update""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/new.yaml', - # sleep before update is registered. 
- 'sleep 20', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;' - # Make sure the traffic is not mixed - 'curl http://$endpoint | grep "Hi, new SkyPilot here"', - # The latest 2 version should be READY and the older versions should be shutting down - (_check_replica_in_status(name, [(2, False, 'READY'), - (2, False, 'SHUTTING_DOWN')]) + - _check_service_version(name, "2")), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_rolling_update(generic_cloud: str): - """Test skyserve with rolling update""" - name = _get_service_name() - single_new_replica = _check_replica_in_status( - name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN')]) - test = smoke_tests_utils.Test( - f'test-skyserve-rolling-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/new.yaml', - # Make sure the traffic is mixed across two versions, the replicas - # with even id will sleep 60 seconds before being ready, so we - # should be able to get observe the period that the traffic is mixed - # across two versions. 
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; ' - # The latest version should have one READY and the one of the older versions should be shutting down - f'{single_new_replica} {_check_service_version(name, "1,2")} ' - # Check the output from the old version, immediately after the - # output from the new version appears. This is guaranteed by the - # round robin load balancing policy. - # TODO(zhwu): we should have a more generalized way for checking the - # mixed version of replicas to avoid depending on the specific - # round robin load balancing policy. - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_fast_update(generic_cloud: str): - """Test skyserve with fast update (Increment version of old replicas)""" - name = _get_service_name() - - test = smoke_tests_utils.Test( - f'test-skyserve-fast-update', - [ - f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', - # sleep to wait for update to be registered. - 'sleep 40', - # 2 on-deamnd (ready) + 1 on-demand (provisioning). - ( - _check_replica_in_status( - name, [(2, False, 'READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) + - # Fast update will directly have the latest version ready. 
- _check_service_version(name, "2")), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - # Test rolling update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', - # sleep to wait for update to be registered. - 'sleep 25', - # 2 on-deamnd (ready) + 1 on-demand (shutting down). - _check_replica_in_status(name, [(2, False, 'READY'), - (1, False, 'SHUTTING_DOWN')]), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_update_autoscale(generic_cloud: str): - """Test skyserve update with autoscale""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-update-autoscale', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/num_min_one.yaml', - # sleep before update is registered. - 'sleep 20', - # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - # Rolling Update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - # sleep before update is registered. 
- 'sleep 20', - # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Spot instances are note supported by Fluidstack -@pytest.mark.serve -@pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes -@pytest.mark.parametrize('mode', ['rolling', 'blue_green']) -def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): - """Test skyserve with update that changes autoscaler""" - name = f'{_get_service_name()}-{mode}' - - wait_until_no_pending = ( - f's=$(sky serve status {name}); echo "$s"; ' - 'until ! echo "$s" | grep PENDING; do ' - ' echo "Waiting for replica to be out of pending..."; ' - f' sleep 5; s=$(sky serve status {name}); ' - ' echo "$s"; ' - 'done') - four_spot_up_cmd = _check_replica_in_status(name, [(4, True, 'READY')]) - update_check = [f'until ({four_spot_up_cmd}); do sleep 5; done; sleep 15;'] - if mode == 'rolling': - # Check rolling update, it will terminate one of the old on-demand - # instances, once there are 4 spot instance ready. - update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN'), (1, False, 'READY')]) + - _check_service_version(name, "1,2"), - ] - else: - # Check blue green update, it will keep both old on-demand instances - # running, once there are 4 spot instance ready. 
- update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]) + - _check_service_version(name, "1"), - ] - test = smoke_tests_utils.Test( - f'test-skyserve-new-autoscaler-update-{mode}', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 's=$(curl http://$endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml', - # Wait for update to be registered - f'sleep 90', - wait_until_no_pending, - _check_replica_in_status( - name, [(4, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]), - *update_check, - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - _check_replica_in_status(name, [(4, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_failures(generic_cloud: str): - """Test replica failure statuses""" - name = _get_service_name() - - test = smoke_tests_utils.Test( - 'test-skyserve-failures', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - 
f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 2; ' - # Make sure no new replicas are started for early failure. - f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 2;', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/failures/probing.yaml', - f's=$(sky serve status {name}); ' - # Wait for replica to be ready. - f'until echo "$s" | grep "READY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - # Wait for replica to change to FAILED_PROBING - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_PROBING"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done', - # Wait for the PENDING replica to appear. - 'sleep 10', - # Wait until the replica is out of PENDING. - f's=$(sky serve status {name}); ' - f'until ! echo "$s" | grep "PENDING" && ! echo "$s" | grep "Please wait for the controller to be ready."; do ' - 'echo "Waiting for replica to be out of pending..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done; ' + - _check_replica_in_status( - name, [(1, False, 'FAILED_PROBING'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - # TODO(zhwu): add test for FAILED_PROVISION - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO(Ziming, Tian): Add tests for autoscaling. 
- - -# ------- Testing user dependencies -------- -def test_user_dependencies(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'user-dependencies', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', - f'sky logs {name} 1 --status', - f'sky exec {name} "echo hi"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - f'sky exec {name} "echo bye"', - f'sky logs {name} 3 --status', - f'sky launch -c {name} tests/test_yamls/different_default_conda_env.yaml', - f'sky logs {name} 4 --status', - # Launch again to test the default env does not affect SkyPilot - # runtime setup - f'sky launch -c {name} "python --version 2>&1 | grep \'Python 3.6\' || exit 1"', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ------- Testing the core API -------- -# Most of the core APIs have been tested in the CLI tests. -# These tests are for testing the return value of the APIs not fully used in CLI. - - -@pytest.mark.gcp -def test_core_api_sky_launch_exec(): - name = smoke_tests_utils.get_cluster_name() - task = sky.Task(run="whoami") - task.set_resources(sky.Resources(cloud=sky.GCP())) - job_id, handle = sky.launch(task, cluster_name=name) - assert job_id == 1 - assert handle is not None - assert handle.cluster_name == name - assert handle.launched_resources.cloud.is_same_cloud(sky.GCP()) - job_id_exec, handle_exec = sky.exec(task, cluster_name=name) - assert job_id_exec == 2 - assert handle_exec is not None - assert handle_exec.cluster_name == name - assert handle_exec.launched_resources.cloud.is_same_cloud(sky.GCP()) - # For dummy task (i.e. task.run is None), the job won't be submitted. 
- dummy_task = sky.Task() - job_id_dummy, _ = sky.exec(dummy_task, cluster_name=name) - assert job_id_dummy is None - sky.down(name) - - -# The sky launch CLI has some additional checks to make sure the cluster is up/ -# restarted. However, the core API doesn't have these; make sure it still works -def test_core_api_sky_launch_fast(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) - try: - task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - # Sleep to let the cluster autostop - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=120) - # Run it again - should work with fast=True - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - finally: - sky.down(name) - - -# ---------- Testing Storage ---------- -class TestStorageWithCredentials: - """Storage tests which require credentials and network connection""" - - AWS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - 'bucket-s3alias', # ends with '-s3alias' suffix - 'bucket--ol-s3', # ends with '--ol-s3' suffix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - ] - - GCS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters (without dots) - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two 
adjacent periods - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1' - # More than 63 characters between dots - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqfghijklmnopqrstuvw' * 5, - # more than 222 characters (with dots) - '192.168.5.4', # formatted as an IP address - 'googbucket', # starts with 'goog' prefix - 'googlebucket', # contains 'google' - 'g00glebucket', # variant of 'google' - 'go0glebucket', # variant of 'google' - 'g0oglebucket', # variant of 'google' - '.abc', # starts with a dot - 'abc.', # ends with a dot - '_abc', # starts with an underscore - 'abc_', # ends with an underscore - ] - - AZURE_INVALID_NAMES = [ - 'ab', # less than 3 characters - # more than 63 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - 'Abcdef', # contains an uppercase letter - '.abc', # starts with a non-letter(dot) - 'a--bc', # contains consecutive hyphens - ] - - IBM_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - 'a.-bc', # contains the sequence '.-' - 'a-.bc', # contains the sequence '-.' 
- 'a&bc' # contains special characters - 'ab^c' # contains special characters - ] - GITIGNORE_SYNC_TEST_DIR_STRUCTURE = { - 'double_asterisk': { - 'double_asterisk_excluded': None, - 'double_asterisk_excluded_dir': { - 'dir_excluded': None, - }, - }, - 'double_asterisk_parent': { - 'parent': { - 'also_excluded.txt': None, - 'child': { - 'double_asterisk_parent_child_excluded.txt': None, - }, - 'double_asterisk_parent_excluded.txt': None, - }, - }, - 'excluded.log': None, - 'excluded_dir': { - 'excluded.txt': None, - 'nested_excluded': { - 'excluded': None, - }, - }, - 'exp-1': { - 'be_excluded': None, - }, - 'exp-2': { - 'be_excluded': None, - }, - 'front_slash_excluded': None, - 'included.log': None, - 'included.txt': None, - 'include_dir': { - 'excluded.log': None, - 'included.log': None, - }, - 'nested_double_asterisk': { - 'one': { - 'also_exclude.txt': None, - }, - 'two': { - 'also_exclude.txt': None, - }, - }, - 'nested_wildcard_dir': { - 'monday': { - 'also_exclude.txt': None, - }, - 'tuesday': { - 'also_exclude.txt': None, - }, - }, - 'no_slash_excluded': None, - 'no_slash_tests': { - 'no_slash_excluded': { - 'also_excluded.txt': None, - }, - }, - 'question_mark': { - 'excluded1.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket': { - 'excluded1.txt': None, - }, - 'square_bracket_alpha': { - 'excludedz.txt': None, - }, - 'square_bracket_excla': { - 'excluded2.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket_single': { - 'excluded0.txt': None, - }, - } - - @staticmethod - def create_dir_structure(base_path, structure): - # creates a given file STRUCTURE in BASE_PATH - for name, substructure in structure.items(): - path = os.path.join(base_path, name) - if substructure is None: - # Create a file - open(path, 'a', encoding='utf-8').close() - else: - # Create a subdirectory - os.mkdir(path) - TestStorageWithCredentials.create_dir_structure( - path, substructure) - - @staticmethod - def cli_delete_cmd(store_type, - bucket_name, - 
storage_account_name: str = None): - if store_type == storage_lib.StoreType.S3: - url = f's3://{bucket_name}' - return f'aws s3 rb {url} --force' - if store_type == storage_lib.StoreType.GCS: - url = f'gs://{bucket_name}' - gsutil_alias, alias_gen = data_utils.get_gsutil_command() - return f'{alias_gen}; {gsutil_alias} rm -r {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage container delete ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} ' - f'--name {bucket_name}') - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {url} --force --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone purge {bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' - - @staticmethod - def cli_ls_cmd(store_type, bucket_name, suffix=''): - if store_type == storage_lib.StoreType.S3: - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'aws s3 ls {url}' - if store_type == storage_lib.StoreType.GCS: - if suffix: - url = f'gs://{bucket_name}/{suffix}' - else: - url = f'gs://{bucket_name}' - return f'gsutil ls {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - config_storage_account = skypilot_config.get_nested( - ('azure', 'storage_account'), None) - storage_account_name = config_storage_account if ( - config_storage_account is not None) else ( - 
storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - list_cmd = ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key}') - return list_cmd - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' - - @staticmethod - def cli_region_cmd(store_type, bucket_name=None, storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - assert bucket_name is not None - return ('aws s3api get-bucket-location ' - f'--bucket {bucket_name} --output text') - elif store_type == storage_lib.StoreType.GCS: - assert bucket_name is not None - return (f'gsutil ls -L -b gs://{bucket_name}/ | ' - 'grep "Location constraint" | ' - 'awk \'{print tolower($NF)}\'') - elif store_type == storage_lib.StoreType.AZURE: - # For Azure Blob Storage, the location of the containers are - # determined by the location of storage accounts. 
- assert storage_account_name is not None - return (f'az storage account show --name {storage_account_name} ' - '--query "primaryLocation" --output tsv') - else: - raise NotImplementedError(f'Region command not implemented for ' - f'{store_type}') - - @staticmethod - def cli_count_name_in_bucket(store_type, - bucket_name, - file_name, - suffix='', - storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - if suffix: - return f'aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - else: - return f'aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - elif store_type == storage_lib.StoreType.GCS: - if suffix: - return f'gsutil ls -r gs://{bucket_name}/{suffix} | grep "{file_name}" | wc -l' - else: - return f'gsutil ls -r gs://{bucket_name} | grep "{file_name}" | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - if storage_account_name is None: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - f'grep {file_name} | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - else: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --query 
"length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - - @staticmethod - def cli_count_file_in_bucket(store_type, bucket_name): - if store_type == storage_lib.StoreType.S3: - return f'aws s3 ls s3://{bucket_name} --recursive | wc -l' - elif store_type == storage_lib.StoreType.GCS: - return f'gsutil ls -r gs://{bucket_name}/** | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - 'grep \\"name\\": | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{bucket_name} --recursive --endpoint {endpoint_url} --profile=r2 | wc -l' - - @pytest.fixture - def tmp_source(self, tmp_path): - # Creates a temporary directory with a file in it - tmp_dir = tmp_path / 'tmp-source' - tmp_dir.mkdir() - tmp_file = tmp_dir / 'tmp-file' - tmp_file.write_text('test') - circle_link = tmp_dir / 'circle-link' - circle_link.symlink_to(tmp_dir, target_is_directory=True) - yield str(tmp_dir) - - @staticmethod - def generate_bucket_name(): - # Creates a temporary bucket name - # time.time() returns varying precision on different systems, so we - # replace the decimal point and use whatever precision we can get. 
- timestamp = str(time.time()).replace('.', '') - return f'sky-test-{timestamp}' - - @pytest.fixture - def tmp_bucket_name(self): - yield self.generate_bucket_name() - - @staticmethod - def yield_storage_object( - name: Optional[str] = None, - source: Optional[storage_lib.Path] = None, - stores: Optional[Dict[storage_lib.StoreType, - storage_lib.AbstractStore]] = None, - persistent: Optional[bool] = True, - mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT): - # Creates a temporary storage object. Stores must be added in the test. - storage_obj = storage_lib.Storage(name=name, - source=source, - stores=stores, - persistent=persistent, - mode=mode) - yield storage_obj - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_scratch_storage_obj(self, tmp_bucket_name): - # Creates a storage object with no source to create a scratch storage. - # Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name) - - @pytest.fixture - def tmp_multiple_scratch_storage_obj(self): - # Creates a list of 5 storage objects with no source to create - # multiple scratch storages. - # Stores for each object in the list must be added in the test. 
- storage_mult_obj = [] - for _ in range(5): - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}') - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_multiple_custom_source_storage_obj(self): - # Creates a list of storage objects with custom source names to - # create multiple scratch storages. - # Stores for each object in the list must be added in the test. - custom_source_names = ['"path With Spaces"', 'path With Spaces'] - storage_mult_obj = [] - for name in custom_source_names: - src_path = os.path.expanduser(f'~/{name}') - pathlib.Path(src_path).expanduser().mkdir(exist_ok=True) - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}', - source=src_path) - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.fixture - def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temporary storage object. Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmp_source) - - @pytest.fixture - def tmp_local_list_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temp storage object which uses a list of paths as source. - # Stores must be added in the test. 
After upload, the bucket should - # have two files - /tmp-file and /tmp-source/tmp-file - list_source = [tmp_source, tmp_source + '/tmp-file'] - yield from self.yield_storage_object(name=tmp_bucket_name, - source=list_source) - - @pytest.fixture - def tmp_bulk_del_storage_obj(self, tmp_bucket_name): - # Creates a temporary storage object for testing bulk deletion. - # Stores must be added in the test. - with tempfile.TemporaryDirectory() as tmpdir: - subprocess.check_output(f'mkdir -p {tmpdir}/folder{{000..255}}', - shell=True) - subprocess.check_output(f'touch {tmpdir}/test{{000..255}}.txt', - shell=True) - subprocess.check_output( - f'touch {tmpdir}/folder{{000..255}}/test.txt', shell=True) - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmpdir) - - @pytest.fixture - def tmp_copy_mnt_existing_storage_obj(self, tmp_scratch_storage_obj): - # Creates a copy mount storage which reuses an existing storage object. - tmp_scratch_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_scratch_storage_obj.name - - # Try to initialize another storage with the storage object created - # above, but now in COPY mode. This should succeed. - yield from self.yield_storage_object(name=storage_name, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_gitignore_storage_obj(self, tmp_bucket_name, gitignore_structure): - # Creates a temporary storage object for testing .gitignore filter. - # GITIGINORE_STRUCTURE is representing a file structure in a dictionary - # format. Created storage object will contain the file structure along - # with .gitignore and .git/info/exclude files to test exclude filter. - # Stores must be added in the test. 
- with tempfile.TemporaryDirectory() as tmpdir: - # Creates file structure to be uploaded in the Storage - self.create_dir_structure(tmpdir, gitignore_structure) - - # Create .gitignore and list files/dirs to be excluded in it - skypilot_path = os.path.dirname(os.path.dirname(sky.__file__)) - temp_path = f'{tmpdir}/.gitignore' - file_path = os.path.join(skypilot_path, 'tests/gitignore_test') - shutil.copyfile(file_path, temp_path) - - # Create .git/info/exclude and list files/dirs to be excluded in it - temp_path = f'{tmpdir}/.git/info/' - os.makedirs(temp_path) - temp_exclude_path = os.path.join(temp_path, 'exclude') - file_path = os.path.join(skypilot_path, - 'tests/git_info_exclude_test') - shutil.copyfile(file_path, temp_exclude_path) - - # Create sky Storage with the files created - yield from self.yield_storage_object( - name=tmp_bucket_name, - source=tmpdir, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_awscli_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call(['aws', 's3', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['aws', 's3', 'rb', bucket_uri, '--force']) - - @pytest.fixture - def tmp_gsutil_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - bucket_uri = f'gs://{tmp_bucket_name}' - subprocess.check_call(['gsutil', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['gsutil', 'rm', '-r', bucket_uri]) - - @pytest.fixture - def tmp_az_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - bucket_uri = data_utils.AZURE_CONTAINER_URL.format( - storage_account_name=storage_account_name, - container_name=tmp_bucket_name) - 
subprocess.check_call([ - 'az', 'storage', 'container', 'create', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call([ - 'az', 'storage', 'container', 'delete', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - - @pytest.fixture - def tmp_awscli_bucket_r2(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - endpoint_url = cloudflare.create_endpoint() - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 mb {bucket_uri} --endpoint {endpoint_url} --profile=r2', - shell=True) - yield tmp_bucket_name, bucket_uri - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {bucket_uri} --force --endpoint {endpoint_url} --profile=r2', - shell=True) - - @pytest.fixture - def tmp_ibm_cos_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using IBM COS API - storage_obj = storage_lib.IBMCosStore(source="", name=tmp_bucket_name) - yield tmp_bucket_name - storage_obj.delete() - - @pytest.fixture - def tmp_public_storage_obj(self, request): - # Initializes a storage object with a public bucket - storage_obj = storage_lib.Storage(source=request.param) - yield storage_obj - # This does not require any deletion logic because it is a public bucket - # and should not get added to global_user_state. 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, - store_type): - # Creates a new bucket with a local source, uploads files to it - # and deletes it. - tmp_local_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name in out.decode('utf-8') - - # Run sky storage delete to delete the storage object - subprocess.check_output( - ['sky', 'storage', 'delete', tmp_local_storage_obj.name, '--yes']) - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.xdist_group('multiple_bucket_deletion') - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm) - ]) - def test_multiple_buckets_creation_and_deletion( - self, tmp_multiple_scratch_storage_obj, store_type): - # Creates multiple new buckets(5 buckets) with a local source - # and deletes them. 
- storage_obj_name = [] - for store_obj in tmp_multiple_scratch_storage_obj: - store_obj.add_store(store_type) - storage_obj_name.append(store_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in out for item in storage_obj_name]) - - # Run sky storage delete all to delete all storage objects - delete_cmd = ['sky', 'storage', 'delete', '--yes'] - delete_cmd += storage_obj_name - subprocess.check_output(delete_cmd) - - # Run sky storage ls to check if all storage objects filtered by store - # type are deleted - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item not in out for item in storage_obj_name]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_upload_source_with_spaces(self, store_type, - tmp_multiple_custom_source_storage_obj): - # Creates two buckets with specified local sources - # with spaces in the name - storage_obj_names = [] - for storage_obj in tmp_multiple_custom_source_storage_obj: - storage_obj.add_store(store_type) - storage_obj_names.append(storage_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in 
out for item in storage_obj_names]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_external_deletion(self, tmp_scratch_storage_obj, - store_type): - # Creates a bucket, deletes it externally using cloud cli commands - # and then tries to delete it using sky storage delete. - tmp_scratch_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name in out.decode('utf-8') - - # Delete bucket externally - cmd = self.cli_delete_cmd(store_type, tmp_scratch_storage_obj.name) - subprocess.check_output(cmd, shell=True) - - # Run sky storage delete to delete the storage object - out = subprocess.check_output( - ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name, '--yes']) - # Make sure bucket was not created during deletion (see issue #1322) - assert 'created' not in out.decode('utf-8').lower() - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_bulk_deletion(self, store_type, tmp_bulk_del_storage_obj): - # Creates a temp folder with over 256 files and folders, upload - # files and folders to a new bucket, then delete bucket. 
- tmp_bulk_del_storage_obj.add_store(store_type) - - subprocess.check_output([ - 'sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name, '--yes' - ]) - - output = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_bulk_del_storage_obj.name not in output.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'tmp_public_storage_obj, store_type', - [('s3://tcga-2-open', storage_lib.StoreType.S3), - ('s3://digitalcorpora', storage_lib.StoreType.S3), - ('gs://gcp-public-data-sentinel-2', storage_lib.StoreType.GCS), - pytest.param( - 'https://azureopendatastorage.blob.core.windows.net/nyctlc', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure)], - indirect=['tmp_public_storage_obj']) - def test_public_bucket(self, tmp_public_storage_obj, store_type): - # Creates a new bucket with a public source and verifies that it is not - # added to global_user_state. - tmp_public_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_public_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'nonexist_bucket_url', - [ - 's3://{random_name}', - 'gs://{random_name}', - pytest.param( - 'https://{account_name}.blob.core.windows.net/{random_name}', # pylint: disable=line-too-long - marks=pytest.mark.azure), - pytest.param('cos://us-east/{random_name}', marks=pytest.mark.ibm), - pytest.param('r2://{random_name}', marks=pytest.mark.cloudflare) - ]) - def test_nonexistent_bucket(self, nonexist_bucket_url): - # Attempts to create fetch a stroage with a non-existent source. 
- # Generate a random bucket name and verify it doesn't exist: - retry_count = 0 - while True: - nonexist_bucket_name = str(uuid.uuid4()) - if nonexist_bucket_url.startswith('s3'): - command = f'aws s3api head-bucket --bucket {nonexist_bucket_name}' - expected_output = '404' - elif nonexist_bucket_url.startswith('gs'): - command = f'gsutil ls {nonexist_bucket_url.format(random_name=nonexist_bucket_name)}' - expected_output = 'BucketNotFoundException' - elif nonexist_bucket_url.startswith('https'): - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - command = f'az storage container exists --account-name {storage_account_name} --account-key {storage_account_key} --name {nonexist_bucket_name}' - expected_output = '"exists": false' - elif nonexist_bucket_url.startswith('r2'): - endpoint_url = cloudflare.create_endpoint() - command = f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api head-bucket --bucket {nonexist_bucket_name} --endpoint {endpoint_url} --profile=r2' - expected_output = '404' - elif nonexist_bucket_url.startswith('cos'): - # Using API calls, since using rclone requires a profile's name - try: - expected_output = command = "echo" # avoid unrelated exception in case of failure. 
- bucket_name = urllib.parse.urlsplit( - nonexist_bucket_url.format( - random_name=nonexist_bucket_name)).path.strip('/') - client = ibm.get_cos_client('us-east') - client.head_bucket(Bucket=bucket_name) - except ibm.ibm_botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == '404': - # success - return - else: - raise ValueError('Unsupported bucket type ' - f'{nonexist_bucket_url}') - - # Check if bucket exists using the cli: - try: - out = subprocess.check_output(command, - stderr=subprocess.STDOUT, - shell=True) - except subprocess.CalledProcessError as e: - out = e.output - out = out.decode('utf-8') - if expected_output in out: - break - else: - retry_count += 1 - if retry_count > 3: - raise RuntimeError('Unable to find a nonexistent bucket ' - 'to use. This is higly unlikely - ' - 'check if the tests are correct.') - - with pytest.raises(sky.exceptions.StorageBucketGetError, - match='Attempted to use a non-existent'): - if nonexist_bucket_url.startswith('https'): - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - account_name=storage_account_name, - random_name=nonexist_bucket_name)) - else: - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - random_name=nonexist_bucket_name)) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'private_bucket', - [ - f's3://imagenet', - f'gs://imagenet', - pytest.param('https://smoketestprivate.blob.core.windows.net/test', - marks=pytest.mark.azure), # pylint: disable=line-too-long - pytest.param('cos://us-east/bucket1', marks=pytest.mark.ibm) - ]) - def test_private_bucket(self, private_bucket): - # Attempts to access private buckets not belonging to the user. - # These buckets are known to be private, but may need to be updated if - # they are removed by their owners. 
- store_type = urllib.parse.urlsplit(private_bucket).scheme - if store_type == 'https' or store_type == 'cos': - private_bucket_name = urllib.parse.urlsplit( - private_bucket).path.strip('/') - else: - private_bucket_name = urllib.parse.urlsplit(private_bucket).netloc - with pytest.raises( - sky.exceptions.StorageBucketGetError, - match=storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( - name=private_bucket_name)): - storage_obj = storage_lib.Storage(source=private_bucket) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_az_bucket', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param('tmp_ibm_cos_bucket', - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_upload_to_existing_bucket(self, ext_bucket_fixture, request, - tmp_source, store_type): - # Tries uploading existing files to newly created bucket (outside of - # sky) and verifies that files are written. 
- bucket_name, _ = request.getfixturevalue(ext_bucket_fixture) - storage_obj = storage_lib.Storage(name=bucket_name, source=tmp_source) - storage_obj.add_store(store_type) - - # Check if tmp_source/tmp-file exists in the bucket using aws cli - out = subprocess.check_output(self.cli_ls_cmd(store_type, bucket_name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check symlinks - symlinks don't get copied by sky storage - assert (pathlib.Path(tmp_source) / 'circle-link').is_symlink(), ( - 'circle-link was not found in the upload source - ' - 'are the test fixtures correct?') - assert 'circle-link' not in out.decode('utf-8'), ( - 'Symlink found in bucket - ls output was : {}'.format( - out.decode('utf-8'))) - - # Run sky storage ls to check if storage object exists in the output. - # It should not exist because the bucket was created externally. - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - def test_copy_mount_existing_storage(self, - tmp_copy_mnt_existing_storage_obj): - # Creates a bucket with no source in MOUNT mode (empty bucket), and - # then tries to load the same storage in COPY mode. - tmp_copy_mnt_existing_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_copy_mnt_existing_storage_obj.name - - # Check `sky storage ls` to ensure storage object exists - out = subprocess.check_output(['sky', 'storage', 'ls']).decode('utf-8') - assert storage_name in out, f'Storage {storage_name} not found in sky storage ls.' 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_list_source(self, tmp_local_list_storage_obj, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. - tmp_local_list_storage_obj.add_store(store_type) - - # Check if tmp-file exists in the bucket root using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check if tmp-file exists in the bucket/tmp-source using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name, 'tmp-source/'), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('invalid_name_list, store_type', - [(AWS_INVALID_NAMES, storage_lib.StoreType.S3), - (GCS_INVALID_NAMES, storage_lib.StoreType.GCS), - pytest.param(AZURE_INVALID_NAMES, - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param(IBM_INVALID_NAMES, - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param(AWS_INVALID_NAMES, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_invalid_names(self, invalid_name_list, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. 
- for name in invalid_name_list: - with pytest.raises(sky.exceptions.StorageNameError): - storage_obj = storage_lib.Storage(name=name) - storage_obj.add_store(store_type) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'gitignore_structure, store_type', - [(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.S3), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.GCS), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.AZURE), - pytest.param(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, - store_type, - tmp_gitignore_storage_obj): - # tests if files included in .gitignore and .git/info/exclude are - # excluded from being transferred to Storage - - tmp_gitignore_storage_obj.add_store(store_type) - - upload_file_name = 'included' - # Count the number of files with the given file name - up_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name=upload_file_name) - git_exclude_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name='.git') - cnt_num_file_cmd = self.cli_count_file_in_bucket( - store_type, tmp_gitignore_storage_obj.name) - - up_output = subprocess.check_output(up_cmd, shell=True) - git_exclude_output = subprocess.check_output(git_exclude_cmd, - shell=True) - cnt_output = subprocess.check_output(cnt_num_file_cmd, shell=True) - - assert '3' in up_output.decode('utf-8'), \ - 'Files to be included are not completely uploaded.' - # 1 is read as .gitignore is uploaded - assert '1' in git_exclude_output.decode('utf-8'), \ - '.git directory should not be uploaded.' - # 4 files include .gitignore, included.log, included.txt, include_dir/included.log - assert '4' in cnt_output.decode('utf-8'), \ - 'Some items listed in .gitignore and .git/info/exclude are not excluded.' 
- - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_externally_created_bucket_mount_without_source( - self, ext_bucket_fixture, request, store_type): - # Non-sky managed buckets(buckets created outside of Skypilot CLI) - # are allowed to be MOUNTed by specifying the URI of the bucket to - # source field only. When it is attempted by specifying the name of - # the bucket only, it should error out. - # - # TODO(doyoung): Add test for IBM COS. Currently, this is blocked - # as rclone used to interact with IBM COS does not support feature to - # create a bucket, and the ibmcloud CLI is not supported in Skypilot. - # Either of the feature is necessary to simulate an external bucket - # creation for IBM COS. - # https://github.com/skypilot-org/skypilot/pull/1966/files#r1253439837 - - ext_bucket_name, ext_bucket_uri = request.getfixturevalue( - ext_bucket_fixture) - # invalid spec - with pytest.raises(sky.exceptions.StorageSpecError) as e: - storage_obj = storage_lib.Storage( - name=ext_bucket_name, mode=storage_lib.StorageMode.MOUNT) - storage_obj.add_store(store_type) - - assert 'Attempted to mount a non-sky managed bucket' in str(e) - - # valid spec - storage_obj = storage_lib.Storage(source=ext_bucket_uri, - mode=storage_lib.StorageMode.MOUNT) - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'ap-northeast-1', 'ap-northeast-2', 'ap-northeast-3', 'ap-south-1', - 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'eu-north-1', - 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 'us-east-1', - 'us-east-2', 'us-west-1', 'us-west-2' - ]) - def test_aws_regions(self, tmp_local_storage_obj, region): - # This tests 
creation and upload to bucket in all AWS s3 regions - # To test full functionality, use test_managed_jobs_storage above. - store_type = storage_lib.StoreType.S3 - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - expected_output_region = region - if region == 'us-east-1': - expected_output_region = 'None' # us-east-1 is the default region - assert expected_output_region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'northamerica-northeast1', 'northamerica-northeast2', 'us-central1', - 'us-east1', 'us-east4', 'us-east5', 'us-south1', 'us-west1', 'us-west2', - 'us-west3', 'us-west4', 'southamerica-east1', 'southamerica-west1', - 'europe-central2', 'europe-north1', 'europe-southwest1', 'europe-west1', - 'europe-west2', 'europe-west3', 'europe-west4', 'europe-west6', - 'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12', - 'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2', - 'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2', - 'asia-southeast2', 'me-central1', 'me-central2', 'me-west1', - 'australia-southeast1', 'australia-southeast2', 'africa-south1' - ]) - def test_gcs_regions(self, tmp_local_storage_obj, region): - # This tests creation and upload to bucket in all GCS regions - # To test full functionality, use 
test_managed_jobs_storage above. - store_type = storage_lib.StoreType.GCS - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - assert region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - -# ---------- Testing YAML Specs ---------- -# Our sky storage requires credentials to check the bucket existance when -# loading a task from the yaml file, so we cannot make it a unit test. -class TestYamlSpecs: - # TODO(zhwu): Add test for `to_yaml_config` for the Storage object. - # We should not use `examples/storage_demo.yaml` here, since it requires - # users to ensure bucket names to not exist and/or be unique. 
- _TEST_YAML_PATHS = [ - 'examples/minimal.yaml', 'examples/managed_job.yaml', - 'examples/using_file_mounts.yaml', 'examples/resnet_app.yaml', - 'examples/multi_hostname.yaml' - ] - - def _is_dict_subset(self, d1, d2): - """Check if d1 is the subset of d2.""" - for k, v in d1.items(): - if k not in d2: - if isinstance(v, list) or isinstance(v, dict): - assert len(v) == 0, (k, v) - else: - assert False, (k, v) - elif isinstance(v, dict): - assert isinstance(d2[k], dict), (k, v, d2) - self._is_dict_subset(v, d2[k]) - elif isinstance(v, str): - if k == 'accelerators': - resources = sky.Resources() - resources._set_accelerators(v, None) - assert resources.accelerators == d2[k], (k, v, d2) - else: - assert v.lower() == d2[k].lower(), (k, v, d2[k]) - else: - assert v == d2[k], (k, v, d2[k]) - - def _check_equivalent(self, yaml_path): - """Check if the yaml is equivalent after load and dump again.""" - origin_task_config = common_utils.read_yaml(yaml_path) - - task = sky.Task.from_yaml(yaml_path) - new_task_config = task.to_yaml_config() - # d1 <= d2 - print(origin_task_config, new_task_config) - self._is_dict_subset(origin_task_config, new_task_config) - - def test_load_dump_yaml_config_equivalent(self): - """Test if the yaml config is equivalent after load and dump again.""" - pathlib.Path('~/datasets').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/tmpfile').expanduser().touch() - pathlib.Path('~/.ssh').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/.ssh/id_rsa.pub').expanduser().touch() - pathlib.Path('~/tmp-workdir').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/Downloads/tpu').expanduser().mkdir(parents=True, - exist_ok=True) - for yaml_path in self._TEST_YAML_PATHS: - self._check_equivalent(yaml_path) - - -# ---------- Testing Multiple Accelerators ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support K80 gpus for now -@pytest.mark.no_paperspace # Paperspace does not support K80 gpus -def test_multiple_accelerators_ordered(): - name 
= smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_ordered_with_default(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-unordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered_with_default(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-unordered-with-default', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -def test_multiple_resources(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-resources', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Sky Benchmark ---------- -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -@pytest.mark.no_paperspace # Requires other clouds to be enabled -@pytest.mark.no_kubernetes -@pytest.mark.aws # SkyBenchmark requires S3 access -def test_sky_bench(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'sky-bench', - [ - f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', - 'sleep 120', - f'sky bench show {name} | grep sky-bench-{name} | grep FINISHED', - ], - f'sky bench down {name} -y; sky bench delete {name} -y', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_failover(): - """Test if the kubernetes context failover works. 
- - This test requires two kubernetes clusters: - - kind-skypilot: the local cluster with mock labels for 8 H100 GPUs. - - another accessible cluster: with enough CPUs - To start the first cluster, run: - sky local up - # Add mock label for accelerator - kubectl label node --overwrite skypilot-control-plane skypilot.co/accelerator=h100 --context kind-skypilot - # Get the token for the cluster in context kind-skypilot - TOKEN=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.users[0].user.token}\') - # Get the API URL for the cluster in context kind-skypilot - API_URL=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.clusters[0].cluster.server}\') - # Add mock capacity for GPU - curl --header "Content-Type: application/json-patch+json" --header "Authorization: Bearer $TOKEN" --request PATCH --data \'[{"op": "add", "path": "/status/capacity/nvidia.com~1gpu", "value": "8"}]\' "$API_URL/api/v1/nodes/skypilot-control-plane/status" - # Add a new namespace to test the handling of namespaces - kubectl create namespace test-namespace --context kind-skypilot - # Set the namespace to test-namespace - kubectl config set-context kind-skypilot --namespace=test-namespace --context kind-skypilot - """ - # Get context that is not kind-skypilot - contexts = subprocess.check_output('kubectl config get-contexts -o name', - shell=True).decode('utf-8').split('\n') - context = [context for context in contexts if context != 'kind-skypilot'][0] - config = textwrap.dedent(f"""\ - kubernetes: - allowed_contexts: - - kind-skypilot - - {context} - """) - with tempfile.NamedTemporaryFile(delete=True) as f: - f.write(config.encode('utf-8')) - f.flush() - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'kubernetes-context-failover', - [ - # Check if kind-skypilot is provisioned with H100 annotations already - 'NODE_INFO=$(kubectl get nodes -o yaml --context kind-skypilot) && ' - 'echo "$NODE_INFO" | grep nvidia.com/gpu | 
grep 8 && ' - 'echo "$NODE_INFO" | grep skypilot.co/accelerator | grep h100 || ' - '{ echo "kind-skypilot does not exist ' - 'or does not have mock labels for GPUs. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - # Check namespace for kind-skypilot is test-namespace - 'kubectl get namespaces --context kind-skypilot | grep test-namespace || ' - '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - 'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"', - # Get contexts and set current context to the other cluster that is not kind-skypilot - f'kubectl config use-context {context}', - # H100 should not in the current context - '! sky show-gpus --cloud kubernetes | grep H100', - f'sky launch -y -c {name}-1 --cpus 1 echo hi', - f'sky logs {name}-1 --status', - # It should be launched not on kind-skypilot - f'sky status -a {name}-1 | grep "{context}"', - # Test failure for launching H100 on other cluster - f'sky launch -y -c {name}-2 --gpus H100 --cpus 1 --cloud kubernetes --region {context} echo hi && exit 1 || true', - # Test failover - f'sky launch -y -c {name}-3 --gpus H100 --cpus 1 --cloud kubernetes echo hi', - f'sky logs {name}-3 --status', - # Test pods - f'kubectl get pods --context kind-skypilot | grep "{name}-3"', - # It should be launched on kind-skypilot - f'sky status -a {name}-3 | grep "kind-skypilot"', - # Should be 7 free GPUs - f'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep " 7"', - # Remove the line with "kind-skypilot" - f'sed -i "/kind-skypilot/d" {f.name}', - # Should still be able to exec and launch on existing cluster - f'sky exec {name}-3 "echo hi"', - f'sky logs {name}-3 --status', - f'sky status -r {name}-3 | grep UP', - f'sky launch -c {name}-3 --gpus h100 echo hi', - f'sky logs {name}-3 
--status', - f'sky status -r {name}-3 | grep UP', - ], - f'sky down -y {name}-1 {name}-3', - env={'SKYPILOT_CONFIG': f.name}, - ) - smoke_tests_utils.run_one_test(test) diff --git a/tests/test_yamls/minimal_test_required_before_merge.yaml b/tests/test_yamls/minimal_test_pre_merge.yaml similarity index 60% rename from tests/test_yamls/minimal_test_required_before_merge.yaml rename to tests/test_yamls/minimal_test_pre_merge.yaml index aceb5a76cb0..583575bee5c 100644 --- a/tests/test_yamls/minimal_test_required_before_merge.yaml +++ b/tests/test_yamls/minimal_test_pre_merge.yaml @@ -10,4 +10,4 @@ workdir: . num_nodes: 1 run: | - ls -l ~/aws/tests/test_yamls/minimal_test_required_before_merge.yaml + ls -l ~/aws/tests/test_yamls/minimal_test_pre_merge.yaml From f29637fa2f7fddfff7ada61b35edfa9e0aa289a1 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 11 Dec 2024 18:30:29 +0800 Subject: [PATCH 61/64] fix import --- .buildkite/generate_pipeline.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 8f1389d409a..636923ae37a 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -7,9 +7,9 @@ ├── test_*.py -> release pipeline ├── test_pre_merge.py -> pre-merge pipeline -run `python .buildkite/generate_pipeline.py` to generate the pipeline for -testing. The CI will run this script as a pre-step, and use the generated -pipeline to run the tests. +run `PYTHONPATH=$(pwd)/tests:$PYTHONPATH python .buildkite/generate_pipeline.py` +to generate the pipeline for testing. The CI will run this script as a pre-step, +and use the generated pipeline to run the tests. 1. release pipeline, which runs all smoke tests by default, generates all smoke tests for all clouds. 
@@ -27,14 +27,9 @@ import sys from typing import Any, Dict, List, Optional -import yaml - -# Add project root to Python path -tests_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'tests') -sys.path.append(tests_path) - from conftest import all_clouds_in_smoke_tests from conftest import default_clouds_to_run +import yaml DEFAULT_CLOUDS_TO_RUN = default_clouds_to_run ALL_CLOUDS_IN_SMOKE_TESTS = all_clouds_in_smoke_tests From 010f4afed3a6989d7de8821386928b4acf5dd776 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 11 Dec 2024 18:48:20 +0800 Subject: [PATCH 62/64] fix import --- .buildkite/generate_pipeline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 636923ae37a..62e304ffcda 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,15 +24,14 @@ import ast import os import random -import sys from typing import Any, Dict, List, Optional -from conftest import all_clouds_in_smoke_tests +from conftest import cloud_to_pytest_keyword from conftest import default_clouds_to_run import yaml DEFAULT_CLOUDS_TO_RUN = default_clouds_to_run -ALL_CLOUDS_IN_SMOKE_TESTS = all_clouds_in_smoke_tests +PYTEST_TO_CLOUD_KEYWORD = {v: k for k, v in cloud_to_pytest_keyword.items()} QUEUE_GENERIC_CLOUD = 'generic_cloud' QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve' @@ -119,10 +118,11 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if suffix == 'serve': is_serve_test = True continue - if suffix not in ALL_CLOUDS_IN_SMOKE_TESTS: + if suffix not in PYTEST_TO_CLOUD_KEYWORD: # This mark does not specify a cloud, so we skip it. 
continue - clouds_to_include.append(suffix) + clouds_to_include.append( + PYTEST_TO_CLOUD_KEYWORD[suffix]) clouds_to_include = (clouds_to_include if clouds_to_include else DEFAULT_CLOUDS_TO_RUN) clouds_to_include = [ From 8d5d023ddbcfeabbfc9b284e6a363607e609d916 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 12 Dec 2024 10:51:59 +0800 Subject: [PATCH 63/64] support gcp on pre merge test --- .buildkite/generate_pipeline.py | 3 +-- tests/smoke_tests/test_pre_merge.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 62e304ffcda..2b0f1cec788 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -222,8 +222,7 @@ def _convert_pre_merge(test_files: List[str]): 'command': 'bash tests/backward_compatibility_tests.sh', 'agents': { 'queue': 'back_compat' - }, - 'if': 'build.env("aws") == "1"' + } }) output_file_pipelines.append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_pre_merge.py index a2da638b8de..5254b289df1 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_pre_merge.py @@ -2,29 +2,30 @@ # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/smoke_tests/test_required_before_merge.py +# > pytest tests/smoke_tests/test_pre_merge.py # # Terminate failed clusters after test finishes -# > pytest tests/smoke_tests/test_required_before_merge.py --terminate-on-failure +# > pytest tests/smoke_tests/test_pre_merge.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount +# > pytest tests/smoke_tests/test_pre_merge.py::test_yaml_launch_and_mount # # Only run test for AWS + generic tests -# > pytest 
tests/smoke_tests/test_required_before_merge.py --aws -# -# Change cloud for generic tests to aws -# > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws +# > pytest tests/smoke_tests/test_pre_merge.py --aws +import pytest from smoke_tests import smoke_tests_utils import sky -def test_yaml_launch_and_mount(generic_cloud: str): +@pytest.mark.aws +@pytest.mark.azure +@pytest.mark.gcp +def test_yaml_launch_and_mount(): name = smoke_tests_utils.get_cluster_name() test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', From 0bd7d044cc59692c1b16d1222c0bd17c01e66fce Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 12 Dec 2024 11:37:32 +0800 Subject: [PATCH 64/64] no gcp test case for pre merge --- tests/smoke_tests/test_pre_merge.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_pre_merge.py index 5254b289df1..4890ac15ce4 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_pre_merge.py @@ -15,17 +15,16 @@ # # Only run test for AWS + generic tests # > pytest tests/smoke_tests/test_pre_merge.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_pre_merge.py --generic-cloud aws -import pytest from smoke_tests import smoke_tests_utils import sky -@pytest.mark.aws -@pytest.mark.azure -@pytest.mark.gcp -def test_yaml_launch_and_mount(): +def test_yaml_launch_and_mount(generic_cloud: str): name = smoke_tests_utils.get_cluster_name() test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount',