From 468409c51d3daf8a0df6632504a8cff0f324429e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Thu, 7 Nov 2024 17:03:08 +0800 Subject: [PATCH 01/64] event based smoke test --- tests/test_smoke.py | 99 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index cdfd9dfc7cb..b51e720e84a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,6 +25,7 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws +import enum import inspect import json import os @@ -60,6 +61,8 @@ from sky.data.data_utils import Rclone from sky.skylet import constants from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus from sky.utils import common_utils from sky.utils import resources_utils from sky.utils import subprocess_utils @@ -95,6 +98,64 @@ 'sleep 10; s=$(sky jobs queue);' 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') +_WAIT_UNTIL_CLUSTER_STATUS_IS = ( + # A while loop to wait until the cluster status + # becomes certain status, with timeout. 
+ 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky status {cluster_name} --refresh | ' + 'awk "/^{cluster_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' + 'if [ "$current_status" == "{cluster_status}" ]; ' + 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' + 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' + 'sleep 30; ' + 'done') + +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( + # A while loop to wait until the cluster is not found or timeout + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' + 'fi; ' + 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' + ' echo "Cluster {cluster_name} successfully removed."; break; ' + 'fi; ' + 'echo "Waiting for cluster {name} to be removed..."; ' + 'sleep 15; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS = ( + # A while loop to wait until the job status + # contains certain status, with timeout. 
+ 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky queue {cluster_name} | ' + 'awk "/{job_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' + 'found=0; ' # Initialize found variable outside the loop + 'while read -r line; do ' # Read line by line + ' if [ "$line" == "{job_status}" ]; then ' # Check each line + ' echo "Target job status \'{job_status}\' reached."; ' + ' found=1; ' + ' break; ' # Break inner loop + ' fi; ' + 'done <<< "$current_status"; ' + 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found + 'echo "Waiting for job status to contains \'{job_status}\', current status: $current_status"; ' + 'sleep 15; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS.replace( + 'awk "/{job_name}/', 'awk "') + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -399,7 +460,6 @@ def test_launch_fast_with_autostop(generic_cloud: str): # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = Test( 'test_launch_fast_with_autostop', [ @@ -407,10 +467,12 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', - f'sleep {autostop_timeout}', # Ensure cluster is stopped - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Launch again. 
Do full output validation - we expect the cluster to re-launch f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', @@ -808,7 +870,10 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - 'sleep 60', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=60), f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', f'sky logs {name}-clone 1 --status', @@ -854,8 +919,8 @@ def test_gcp_mig(): # Check MIG exists. f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - 'sleep 120', - f'sky status -r {name}; sky status {name} | grep "{name} not found"', + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. 
@@ -922,8 +987,10 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - 'sleep 60', - f'sky status -r {name} | grep "STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', @@ -944,7 +1011,10 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - 'sleep 100', # Ensure this is large enough, else GCP leaks. + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', @@ -972,13 +1042,18 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - 'sleep 40', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', ) From 7191844ae2a7466897c75fc42ed9c116936a0db5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 17:14:31 +0800 Subject: [PATCH 02/64] more event based smoke test --- tests/test_smoke.py | 47 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b51e720e84a..a11ff9d8ed8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -108,11 +108,11 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' - 'if [ "$current_status" == "{cluster_status}" ]; ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^{cluster_status}$/) print \$i}}"); ' + 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' - 'sleep 30; ' + 'sleep 15; ' 'done') _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( @@ -129,7 +129,7 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS = ( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. 
'start_time=$SECONDS; ' @@ -138,7 +138,7 @@ ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' 'fi; ' 'current_status=$(sky queue {cluster_name} | ' - 'awk "/{job_name}/ ' + 'awk "\\$1 == \\"{job_id}\\" ' '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line @@ -153,8 +153,11 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS.replace( - 'awk "/{job_name}/', 'awk "') +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_name}\\"', 'awk "') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\"') DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -1083,8 +1086,10 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', ) @@ -1888,10 +1893,15 @@ def test_multi_echo(generic_cloud: str): 'multi_echo', [ f'python examples/multi_echo.py {name} {generic_cloud}', - 'sleep 120', ] + # Ensure jobs succeeded. 
- [f'sky logs {name} {i + 1} --status' for i in range(32)] + + [ + _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=name, + job_id=i + 1, + job_status=JobStatus.SUCCEEDED.value, + timeout=120) for i in range(32) + ] + # Ensure monitor/autoscaler didn't crash on the 'assert not # unfulfilled' error. If process not found, grep->ssh returns 1. [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], @@ -1984,7 +1994,8 @@ def test_tpu(): f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. ], - f'sky down -y {name}', + 'echo "hello"', + #f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) run_one_test(test) @@ -2444,12 +2455,18 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - f'sleep 20', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. 
- 'sleep 180', - f'sky status -r {name} | grep "INIT\|STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=200), ], f'sky down -y {name}', ) From 5cbebebae882ff172917727b4aa00ab767bd986e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 17:56:37 +0800 Subject: [PATCH 03/64] more test cases --- tests/test_smoke.py | 53 ++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index a11ff9d8ed8..9c422cda194 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -108,10 +108,10 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^{cluster_status}$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' 'if [[ "$current_status" =~ {cluster_status} ]]; ' - 'then echo "Target cluster status \'{cluster_status}\' reached."; break; fi; ' - 'echo "Waiting for cluster status to become \'{cluster_status}\', current status: $current_status"; ' + 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' + 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' 'sleep 15; ' 'done') @@ -143,21 +143,21 @@ 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line ' if [ "$line" == "{job_status}" ]; then ' # Check each line - ' echo "Target job status \'{job_status}\' reached."; ' + ' echo "Target job status {job_status} reached."; ' ' found=1; ' ' break; ' # Break inner loop ' fi; ' 'done <<< "$current_status"; ' 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found - 'echo "Waiting for job status to contains \'{job_status}\', current status: $current_status"; ' + 'echo "Waiting for job status to contains {job_status}, 
current status: $current_status"; ' 'sleep 15; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_name}\\"', 'awk "') + 'awk "\\$1 == \\"{job_id}\\"', 'awk "') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\"') + 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -2489,9 +2489,12 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - 'sleep 260', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=280) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], f'sky down -y {name}', timeout=30 * 60, # 30 mins @@ -2527,8 +2530,10 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). f'sky start -y {name}', @@ -2544,8 +2549,10 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 
'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Test restarting the idleness timer via exec: f'sky start -y {name}', @@ -2555,8 +2562,10 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, @@ -2773,15 +2782,19 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - 'sleep 90', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - 'sleep 120', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=120), ], f'sky down -y {name}', ) From 6f6840901b8407be2e20d9093565813029b2f83e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 18:43:20 +0800 Subject: [PATCH 04/64] more test cases with managed jobs 
--- tests/test_smoke.py | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9c422cda194..339d7062b0a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -98,6 +98,8 @@ 'sleep 10; s=$(sky jobs queue);' 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') +# Cluster functions + _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. @@ -142,7 +144,7 @@ '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line - ' if [ "$line" == "{job_status}" ]; then ' # Check each line + ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line ' echo "Target job status {job_status} reached."; ' ' found=1; ' ' break; ' # Break inner loop @@ -153,12 +155,18 @@ 'sleep 15; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') +# Managed job functions + +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( + 'sky queue {cluster_name}', + 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -1053,7 +1061,7 @@ def test_aws_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( cluster_name=name, job_status=JobStatus.FAILED.value, timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), @@ -1086,7 +1094,7 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB_NAME.format( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( cluster_name=name, job_status=JobStatus.FAILED.value, timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) @@ -2814,14 +2822,21 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), f'sky jobs cancel -y -n {name}-1', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep CANCELLED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=f'{JobStatus.CANCELLED.value}', + timeout=230), # Test the functionality for logging. 
f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', @@ -2891,9 +2906,11 @@ def test_managed_jobs_failed_setup(generic_cloud: str): 'managed_jobs_failed_setup', [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - 'sleep 330', # Make sure the job failed quickly. - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{JobStatus.FAILED_SETUP.value}', + timeout=330), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From 1f67691aec7a6d66cf7733190e7ce5a142c361cb Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 11 Nov 2024 18:58:41 +0800 Subject: [PATCH 05/64] bug fix --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 339d7062b0a..043cb63ea96 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2834,7 +2834,7 @@ def test_managed_jobs(generic_cloud: str): timeout=60), f'sky jobs cancel -y -n {name}-1', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', + job_name=f'{name}-1', job_status=f'{JobStatus.CANCELLED.value}', timeout=230), # Test the functionality for logging. 
From be7964ece6275ca782c17d50c5f8db5187cf9bfd Mon Sep 17 00:00:00 2001 From: zepingguo Date: Wed, 13 Nov 2024 16:41:13 +0800 Subject: [PATCH 06/64] bump up seconds --- tests/test_smoke.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e6daae0e588..7d415708cfc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -114,7 +114,7 @@ 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( @@ -128,7 +128,7 @@ ' echo "Cluster {cluster_name} successfully removed."; break; ' 'fi; ' 'echo "Waiting for cluster {name} to be removed..."; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( @@ -152,7 +152,7 @@ 'done <<< "$current_status"; ' 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' - 'sleep 15; ' + 'sleep 10; ' 'done') _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( @@ -167,6 +167,11 @@ 'sky queue {cluster_name}', 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') +# After the timeout, the cluster will stop if autostop is set, and our check +# should be more than the timeout. To address this, we extend the timeout by +# _BUMP_UP_SECONDS before exiting. +_BUMP_UP_SECONDS = 35 + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -2043,8 +2048,7 @@ def test_tpu(): f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. 
], - 'echo "hello"', - #f'sky down -y {name}', + f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) run_one_test(test) @@ -2614,7 +2618,7 @@ def test_autostop(generic_cloud: str): _WAIT_UNTIL_CLUSTER_STATUS_IS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), + timeout=autostop_timeout + _BUMP_UP_SECONDS), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, @@ -2951,7 +2955,7 @@ def test_managed_jobs_failed_setup(generic_cloud: str): _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, job_status=f'{JobStatus.FAILED_SETUP.value}', - timeout=330), + timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From c464005216903f92e91cb7ca946318c31d50b33a Mon Sep 17 00:00:00 2001 From: zpoint Date: Sat, 16 Nov 2024 00:09:48 +0800 Subject: [PATCH 07/64] merge master and resolve conflict --- tests/test_smoke.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e254f6a0870..5aeb1f055fe 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -99,6 +99,8 @@ 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') # Cluster functions +_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) +_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status @@ -110,7 +112,8 @@ 'fi; ' 'current_status=$(sky status {cluster_name} --refresh | ' 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|UP|STOPPED)$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + + ')$/) print \$i}}"); ' 'if [[ "$current_status" =~ {cluster_status} ]]; ' 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' 'echo "Waiting for cluster status to become 
{cluster_status}, current status: $current_status"; ' @@ -141,7 +144,8 @@ 'fi; ' 'current_status=$(sky queue {cluster_name} | ' 'awk "\\$1 == \\"{job_id}\\" ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(INIT|PENDING|SETTING_UP|RUNNING|SUCCEEDED|FAILED|FAILED_SETUP|CANCELLED)$/) print \$i}}"); ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + + ')$/) print \$i}}"); ' 'found=0; ' # Initialize found variable outside the loop 'while read -r line; do ' # Read line by line ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line From c054edf56499a39ed42e1e62fba66b5f81411551 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 13:42:24 +0800 Subject: [PATCH 08/64] more test case --- tests/test_smoke.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5aeb1f055fe..434f0099b12 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -120,6 +120,11 @@ 'sleep 10; ' 'done') +_WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_IS.replace( + 'sky status {cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' @@ -530,6 +535,7 @@ def test_aws_region(): @pytest.mark.aws def test_aws_with_ssh_proxy_command(): name = _get_cluster_name() + with tempfile.NamedTemporaryFile(mode='w') as f: f.write( textwrap.dedent(f"""\ @@ -551,10 +557,18 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- 'timeout 300s bash -c "until sky status sky-jobs-controller* | grep UP; do sleep 1; done"', + _WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD.format( + cluster_name=f'sky-jobs-controller-*', + cluster_name_awk='sky-jobs-controller-.*', + cluster_status=ClusterStatus.UP.value, + timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name} | grep "STARTING\|RUNNING\|SUCCEEDED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. + format( + job_name=name, + job_status= + f'({JobStatus.SUCCEEDED.value}|{JobStatus.RUNNING.value})', + timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', ) @@ -1817,6 +1831,7 @@ def test_large_job_queue(generic_cloud: str): f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', 'sleep 90', + # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. 
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', From 8675df39250be8db57593ad7e4d99ca1e6b13a24 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 16:44:15 +0800 Subject: [PATCH 09/64] support test_managed_jobs_pipeline_failed_setup --- tests/test_smoke.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 434f0099b12..0b86aaa7227 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -173,8 +173,9 @@ # Managed job functions _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( - 'sky queue {cluster_name}', - 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + 'sky queue {cluster_name}', 'sky jobs queue').replace( + 'awk "\\$2 == \\"{job_name}\\"', + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"') # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by @@ -3021,7 +3022,10 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - 'sleep 600', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{JobStatus.FAILED_SETUP.value}', + timeout=600), # Make sure the job failed quickly. f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', # Task 0 should be SUCCEEDED. 
From 7e7c055d1b74464021f7b88b4daf1cfd46d4b9e5 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:08:34 +0800 Subject: [PATCH 10/64] support test_managed_jobs_recovery_aws --- tests/test_smoke.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 0b86aaa7227..b22643ec439 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3059,8 +3059,8 @@ def test_managed_jobs_recovery_aws(aws_config_region): 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, job_status=JobStatus.RUNNING.value, timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3070,8 +3070,8 @@ def test_managed_jobs_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, job_status=JobStatus.RUNNING.value, timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', From f631cd3151eab76e2b04bddf930372fbf7daa27a Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:55:16 +0800 Subject: [PATCH 11/64] manged job status --- tests/test_smoke.py | 69 ++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b22643ec439..d3f0e0b6adc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -59,6 +59,7 @@ from sky.data import data_utils from sky.data import storage as storage_lib from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus from sky.skylet import constants from sky.skylet import events from sky.skylet.job_lib import JobStatus @@ -101,6 +102,8 @@ # Cluster functions _ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) _ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) +_ALL_MANAGED_JOB_STATUSES = "|".join( + [status.value for status in ManagedJobStatus]) _WAIT_UNTIL_CLUSTER_STATUS_IS = ( # A while loop to wait until the cluster status @@ -175,7 +178,8 @@ _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( 'sky queue {cluster_name}', 'sky jobs queue').replace( 'awk "\\$2 == \\"{job_name}\\"', - 'awk "\\$2 == \\"{job_name}\\" || \\$3 
== \\"{job_name}\\"') + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( + _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by @@ -568,7 +572,7 @@ def test_aws_with_ssh_proxy_command(): format( job_name=name, job_status= - f'({JobStatus.SUCCEEDED.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value})', timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', @@ -2914,17 +2918,17 @@ def test_managed_jobs(generic_cloud: str): _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-1', job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', timeout=60), _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-2', job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', timeout=60), f'sky jobs cancel -y -n {name}-1', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=f'{name}-1', - job_status=f'{JobStatus.CANCELLED.value}', + job_status=f'{ManagedJobStatus.CANCELLED.value}', timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', @@ -2998,7 +3002,7 @@ def test_managed_jobs_failed_setup(generic_cloud: str): # Make sure the job failed quickly. 
_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, - job_status=f'{JobStatus.FAILED_SETUP.value}', + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -3024,7 +3028,7 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, - job_status=f'{JobStatus.FAILED_SETUP.value}', + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', timeout=600), # Make sure the job failed quickly. f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', @@ -3060,7 +3064,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, job_status=JobStatus.RUNNING.value, timeout=600), + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3071,7 +3077,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, job_status=JobStatus.RUNNING.value, timeout=200), + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3099,15 +3107,19 @@ def test_managed_jobs_recovery_gcp(): 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3130,8 +3142,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. 
@@ -3150,8 +3164,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3181,8 +3197,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. 
@@ -3193,8 +3211,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3220,8 +3240,11 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str): 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|RECOVERING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + timeout=360), ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, From d822c4b1ee53fa64849343cdf62c27a5df017ba9 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 17:58:28 +0800 Subject: [PATCH 12/64] bug fix --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d3f0e0b6adc..799ff805faf 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -572,7 +572,7 @@ def test_aws_with_ssh_proxy_command(): format( job_name=name, job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value})', + 
f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', From 9d8194e33ec88649f862ccb5ba041a086dfab857 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 18:16:24 +0800 Subject: [PATCH 13/64] test managed job cancel --- tests/test_smoke.py | 110 +++++++++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 42 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 799ff805faf..8792b106ea8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3264,8 +3264,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 450', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3276,8 +3278,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 560', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3305,15 +3309,19 @@ def test_managed_jobs_recovery_multi_node_gcp(): 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 420', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3338,13 +3346,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3352,12 +3363,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. 
+ _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3365,8 +3380,11 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + # The job is running in the cluster, will shown as RUNNING. + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' @@ -3376,10 +3394,10 @@ def test_managed_jobs_cancellation_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$(aws ec2 describe-instances --region {region} ' @@ -3414,34 +3432,42 @@ def test_managed_jobs_cancellation_gcp(): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' From 41dfbee2c5bc7d3ef90cb524e3c8e7911c6b63ad Mon Sep 17 00:00:00 2001 From: zepingguo Date: Mon, 18 Nov 2024 18:28:25 +0800 Subject: [PATCH 14/64] test_managed_jobs_storage --- tests/test_smoke.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8792b106ea8..21b2c70cfbf 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3557,8 +3557,10 @@ def test_managed_jobs_storage(generic_cloud: str): *STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region - 'sleep 60', # Wait the spot queue to be updated - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. 
+ format(job_name=name, + job_status=ManagedJobStatus.SUCCEEDED.value, + timeout=60 + _BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd From 6a13540d3134c5ba5f4d648e9c37e8a111f7a6f9 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 11:11:15 +0800 Subject: [PATCH 15/64] more test cases --- tests/test_smoke.py | 59 +++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 21b2c70cfbf..bf1178a6629 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -105,7 +105,7 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -_WAIT_UNTIL_CLUSTER_STATUS_IS = ( +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 'start_time=$SECONDS; ' @@ -123,7 +123,7 @@ 'sleep 10; ' 'done') -_WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_IS.replace( +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', 'awk "/^{cluster_name_awk}/') @@ -499,7 +499,7 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'sky status -r {name} | grep UP', # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -562,7 +562,7 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- _WAIT_UNTIL_CLUSTER_STATUS_IS_WILDCARD.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD.format( cluster_name=f'sky-jobs-controller-*', cluster_name_awk='sky-jobs-controller-.*', cluster_status=ClusterStatus.UP.value, @@ -943,7 +943,7 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=60), @@ -1060,7 +1060,7 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=80), @@ -1084,7 +1084,7 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=100), @@ -1115,7 +1115,7 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=40), @@ -2556,14 +2556,14 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. 
f'sky stop -y {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status= f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', @@ -2590,7 +2590,7 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status= f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', @@ -2631,7 +2631,7 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -2650,7 +2650,7 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), @@ -2663,7 +2663,7 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 
'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout + _BUMP_UP_SECONDS), @@ -2883,7 +2883,7 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=90), @@ -2892,7 +2892,7 @@ def test_stop_gcp_spot(): f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=120), @@ -3584,10 +3584,16 @@ def test_managed_jobs_tpu(): 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep STARTING', - 'sleep 900', # TPU takes a while to launch - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|SUCCEEDED"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), + # TPU takes a while to launch + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', + timeout=900 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3605,8 +3611,10 @@ def test_managed_jobs_inline_env(generic_cloud: str): 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! 
-z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.SUCCEEDED.value, + timeout=20 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3713,8 +3721,11 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. - 'sleep 200', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + timeout=200 + _BUMP_UP_SECONDS), f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', From d83647fe1b897b5317bf42096a001b74d5db18e2 Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 18:23:33 +0800 Subject: [PATCH 16/64] resolve pr comment --- tests/test_smoke.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index bf1178a6629..53a6e517b3b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -123,10 +123,19 @@ 'sleep 10; ' 'done') -_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( - 'sky status {cluster_name}', - 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', - 'awk "/^{cluster_name_awk}/') + +def get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard: str, cluster_status: str, timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + 'sky status 
{cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + return wait_cmd.format(cluster_name=cluster_name_wildcard, + cluster_name_awk=cluster_name_wildcard.replace( + '*', '.*'), + cluster_status=cluster_status, + timeout=timeout) + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout @@ -562,9 +571,8 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS_WILDCARD.format( - cluster_name=f'sky-jobs-controller-*', - cluster_name_awk='sky-jobs-controller-.*', + get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', From 573e83efb3a1c73e52720535911cf043f4d8857e Mon Sep 17 00:00:00 2001 From: zepingguo Date: Tue, 19 Nov 2024 18:29:27 +0800 Subject: [PATCH 17/64] private member function --- tests/test_smoke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 53a6e517b3b..8e54e9856a9 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -124,7 +124,7 @@ 'done') -def get_cmd_wait_until_cluster_status_contains_wildcard( +def _get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard: str, cluster_status: str, timeout: int): wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', @@ -571,7 +571,7 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with 
proxy command. - get_cmd_wait_until_cluster_status_contains_wildcard( + _get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), From 1202d1a5637bb31c7c97fd86c2ac0e105763bc1d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 14:16:03 +0800 Subject: [PATCH 18/64] bug fix --- tests/test_smoke.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index fdc83fb2192..a629816cb22 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3735,7 +3735,7 @@ def test_azure_start_stop_two_nodes(): cluster_name=name, cluster_status= f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', - timeout=200 + _BUMP_UP_SECONDS), + timeout=200 + _BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', @@ -4746,7 +4746,10 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - time.sleep(120) + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED, + timeout=120) # Run it again - should work with fast=True sky.launch(task, cluster_name=name, From 87d7f1248730e0f2921ba1fd9dc558d22ddb4554 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 16:55:16 +0800 Subject: [PATCH 19/64] restructure --- tests/smoke_tests/__init__.py | 2 + tests/smoke_tests/test_basic.py | 206 ++++ tests/smoke_tests/test_images.py | 472 ++++++++++ tests/smoke_tests/test_region_and_zone.py | 267 ++++++ tests/{ => smoke_tests}/test_smoke.py | 1035 +-------------------- tests/smoke_tests/util.py | 381 ++++++++ 6 files changed, 1349 insertions(+), 1014 deletions(-) create mode 100644 tests/smoke_tests/__init__.py create mode 100644 tests/smoke_tests/test_basic.py create mode 100644 tests/smoke_tests/test_images.py create mode 100644 
tests/smoke_tests/test_region_and_zone.py rename tests/{ => smoke_tests}/test_smoke.py (83%) create mode 100644 tests/smoke_tests/util.py diff --git a/tests/smoke_tests/__init__.py b/tests/smoke_tests/__init__.py new file mode 100644 index 00000000000..7f91740c201 --- /dev/null +++ b/tests/smoke_tests/__init__.py @@ -0,0 +1,2 @@ +"""For smoke tests import.""" +__all__ = ['util'] diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py new file mode 100644 index 00000000000..9d8a1225e42 --- /dev/null +++ b/tests/smoke_tests/test_basic.py @@ -0,0 +1,206 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +import enum +import inspect +import json +import os +import pathlib +import shlex +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import jinja2 +import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util 
import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + + +# ---------- Dry run: 2 Tasks in a chain. ---------- +@pytest.mark.no_fluidstack #requires GCP and AWS set up +def test_example_app(): + test = Test( + 'example_app', + ['python examples/example_app.py'], + ) + run_one_test(test) + + +# ---------- A minimal task ---------- +def test_minimal(generic_cloud: str): + name = _get_cluster_name() + test = Test( + 'minimal', + [ + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + # Output validation done. 
+ f'sky logs {name} 1 --status', + f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. + # Test launch output again on existing cluster + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + # Check the logs downloading + f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log', + # Ensure the raylet process has the correct file descriptor limit. + f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + # Install jq for the next test. + f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', + # Check the cluster info + f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', + f'sky logs {name} 5 --status', # Ensure the job succeeded. + f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', + f'sky logs {name} 6 --status', # Ensure the job succeeded. 
+ # Test '-c' for exec + f'sky exec -c {name} echo', + f'sky logs {name} 7 --status', + f'sky exec echo -c {name}', + f'sky logs {name} 8 --status', + f'sky exec -c {name} echo hi test', + f'sky logs {name} 9 | grep "hi test"', + f'sky exec {name} && exit 1 || true', + f'sky exec -c {name} && exit 1 || true', + ], + f'sky down -y {name}', + _get_timeout(generic_cloud), + ) + run_one_test(test) + + +# ---------- Test fast launch ---------- +def test_launch_fast(generic_cloud: str): + name = _get_cluster_name() + + test = Test( + 'test_launch_fast', + [ + # First launch to create the cluster + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 1 --status', + + # Second launch to test fast launch - should not reprovision + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast tests/test_yamls/minimal.yaml) && ' + ' echo "$s" && ' + # Validate that cluster was not re-launched. + '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' + # Validate that setup was not re-run. + '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' + # Validate that the task ran and finished. + 'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + ], + f'sky down -y {name}', + timeout=_get_timeout(generic_cloud), + ) + run_one_test(test) + + +# See cloud exclusion explanations in test_autostop +@pytest.mark.no_fluidstack +@pytest.mark.no_lambda_cloud +@pytest.mark.no_ibm +@pytest.mark.no_kubernetes +def test_launch_fast_with_autostop(generic_cloud: str): + name = _get_cluster_name() + # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure + # the VM is stopped. 
+ autostop_timeout = 600 if generic_cloud == 'azure' else 250 + test = Test( + 'test_launch_fast_with_autostop', + [ + # First launch to create the cluster with a short autostop + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 1 --status', + f'sky status -r {name} | grep UP', + + # Ensure cluster is stopped + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), + + # Launch again. Do full output validation - we expect the cluster to re-launch + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + ], + f'sky down -y {name}', + timeout=_get_timeout(generic_cloud) + autostop_timeout, + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py new file mode 100644 index 00000000000..42438461f76 --- /dev/null +++ b/tests/smoke_tests/test_images.py @@ -0,0 +1,472 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +import enum +import inspect +import json +import os +import pathlib +import shlex 
+import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import jinja2 +import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + + 
+# ---------- Test the image ---------- +@pytest.mark.aws +def test_aws_images(): + name = _get_cluster_name() + test = Test( + 'aws_images', + [ + f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml && exit 1 || true', + f'sky launch -y -c {name} examples/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_images(): + name = _get_cluster_name() + test = Test( + 'gcp_images', + [ + f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky launch -c {name} --image-id skypilot:cpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_images(): + name = _get_cluster_name() + test = Test( + 'azure_images', + [ + f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ f'sky launch -c {name} --image-id skypilot:v1-ubuntu-2004 --cloud azure tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_image_id_dict(): + name = _get_cluster_name() + test = Test( + 'aws_image_id_dict', + [ + # Use image id dict. + f'sky launch -y -c {name} examples/per_region_images.yaml', + f'sky exec {name} examples/per_region_images.yaml', + f'sky exec {name} "ls ~"', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_image_id_dict(): + name = _get_cluster_name() + test = Test( + 'gcp_image_id_dict', + [ + # Use image id dict. + f'sky launch -y -c {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} "ls ~"', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_image_id_dict_region(): + name = _get_cluster_name() + test = Test( + 'aws_image_id_dict_region', + [ + # YAML has + # image_id: + # us-west-2: skypilot:gpu-ubuntu-1804 + # us-east-2: skypilot:gpu-ubuntu-2004 + # Use region to filter image_id dict. + f'sky launch -y -c {name} --region us-east-1 examples/per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. 
+ f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', + # Should success because the image id match for the region. + f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. + # Ensure exec works. + f'sky exec {name} --region us-east-2 examples/per_region_images.yaml', + f'sky exec {name} examples/per_region_images.yaml', + f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_image_id_dict_region(): + name = _get_cluster_name() + test = Test( + 'gcp_image_id_dict_region', + [ + # Use region to filter image_id dict. + f'sky launch -y -c {name} --region us-east1 tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. + f'sky launch -y -c {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', + # Should success because the image id match for the region. 
+ f'sky launch -c {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', + f'sky exec {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', + f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-west3', # Ensure the region is correct. + # Ensure exec works. + f'sky exec {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} --cloud gcp --region us-west3 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_image_id_dict_zone(): + name = _get_cluster_name() + test = Test( + 'aws_image_id_dict_zone', + [ + # YAML has + # image_id: + # us-west-2: skypilot:gpu-ubuntu-1804 + # us-east-2: skypilot:gpu-ubuntu-2004 + # Use zone to filter image_id dict. + f'sky launch -y -c {name} --zone us-east-1b examples/per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. + f'sky launch -y -c {name} --zone us-east-2a examples/per_region_images.yaml', + # Should success because the image id match for the zone. + f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', + # Fail due to image id mismatch. 
+ f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-east-2a', # Ensure the zone is correct. + # Ensure exec works. + f'sky exec {name} --zone us-east-2a examples/per_region_images.yaml', + f'sky exec {name} examples/per_region_images.yaml', + f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_image_id_dict_zone(): + name = _get_cluster_name() + test = Test( + 'gcp_image_id_dict_zone', + [ + # Use zone to filter image_id dict. + f'sky launch -y -c {name} --zone us-east1-a tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', + f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. + f'sky launch -y -c {name} --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', + # Should success because the image id match for the zone. + f'sky launch -y -c {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', + f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', + # Fail due to image id mismatch. + f'sky exec {name} --cloud gcp --image-id skypilot:gpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', + f'sky logs {name} 1 --status', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 --status', + f'sky status --all | grep {name} | grep us-central1', # Ensure the zone is correct. + # Ensure exec works. 
+ f'sky exec {name} --cloud gcp --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', + f'sky exec {name} --cloud gcp --region us-central1 "ls ~"', + f'sky exec {name} "ls ~"', + f'sky logs {name} 4 --status', + f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + f'sky logs {name} 7 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_clone_disk_aws(): + name = _get_cluster_name() + test = Test( + 'clone_disk_aws', + [ + f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', + f'sky stop {name} -y', + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=60), + # Wait for EC2 instance to be in stopped state. + # TODO: event based wait. + 'sleep 60', + f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', + f'sky logs {name}-clone 1 --status', + f'sky logs {name}-clone-2 1 --status', + ], + f'sky down -y {name} {name}-clone {name}-clone-2', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_clone_disk_gcp(): + name = _get_cluster_name() + test = Test( + 'clone_disk_gcp', + [ + f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', + f'sky stop {name} -y', + f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud gcp --zone us-central1-a "cat ~/user_file.txt | grep hello"', + f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud gcp --zone us-east1-b "cat 
~/user_file.txt | grep hello"', + f'sky logs {name}-clone 1 --status', + f'sky logs {name}-clone-2 1 --status', + ], + f'sky down -y {name} {name}-clone {name}-clone-2', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_mig(): + name = _get_cluster_name() + region = 'us-central1' + test = Test( + 'gcp_mig', + [ + f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + # Check MIG exists. + f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', + f'sky autostop -i 0 --down -y {name}', + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + timeout=120), + f'gcloud compute instance-templates list | grep "sky-it-{name}"', + # Launch again with the same region. The original instance template + # should be removed. 
+ f'sky launch -y -c {name} --gpus L4 --num-nodes 2 --region {region} nvidia-smi', + f'sky logs {name} 1 | grep "L4"', + f'sky down -y {name}', + f'gcloud compute instance-templates list | grep "sky-it-{name}" && exit 1 || true', + ], + f'sky down -y {name}', + env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_force_enable_external_ips(): + name = _get_cluster_name() + test_commands = [ + f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', + # Check network of vm is "default" + (f'gcloud compute instances list --filter=name~"{name}" --format=' + '"value(networkInterfaces.network)" | grep "networks/default"'), + # Check External NAT in network access configs, corresponds to external ip + (f'gcloud compute instances list --filter=name~"{name}" --format=' + '"value(networkInterfaces.accessConfigs[0].name)" | grep "External NAT"' + ), + f'sky down -y {name}', + ] + skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' + test = Test('gcp_force_enable_external_ips', + test_commands, + f'sky down -y {name}', + env={'SKYPILOT_CONFIG': skypilot_config}) + run_one_test(test) + + +@pytest.mark.aws +def test_image_no_conda(): + name = _get_cluster_name() + test = Test( + 'image_no_conda', + [ + # Use image id dict. 
+ f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', + f'sky logs {name} 1 --status', + f'sky stop {name} -y', + f'sky start {name} -y', + f'sky exec {name} examples/per_region_images.yaml', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation +@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances +def test_custom_default_conda_env(generic_cloud: str): + name = _get_cluster_name() + test = Test('custom_default_conda_env', [ + f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky status -r {name} | grep "UP"', + f'sky logs {name} 1 --status', + f'sky logs {name} 1 --no-follow | grep -E "myenv\\s+\\*"', + f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky logs {name} 2 --status', + f'sky autostop -y -i 0 {name}', + _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=80), + f'sky start -y {name}', + f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', + f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky logs {name} 3 --status', + ], f'sky down -y {name}') + run_one_test(test) diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py new file mode 100644 index 00000000000..57f84ff4a0e --- /dev/null +++ b/tests/smoke_tests/test_region_and_zone.py @@ -0,0 +1,267 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest 
tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +import enum +import inspect +import json +import os +import pathlib +import shlex +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import jinja2 +import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from 
sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + + +# ---------- Test region ---------- +@pytest.mark.aws +def test_aws_region(): + name = _get_cluster_name() + test = Test( + 'aws_region', + [ + f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', + f'sky exec {name} examples/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_with_ssh_proxy_command(): + name = _get_cluster_name() + + with tempfile.NamedTemporaryFile(mode='w') as f: + f.write( + textwrap.dedent(f"""\ + aws: + ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} + """)) + f.flush() + test = Test( + 'aws_with_ssh_proxy_command', + [ + f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', + # Use jump config + f'export SKYPILOT_CONFIG={f.name}; ' + f'sky launch -y -c {name} --cloud aws --cpus 2 --region us-east-1 echo hi', + f'sky logs {name} 1 --status', + f'export SKYPILOT_CONFIG={f.name}; sky exec {name} echo hi', + f'sky logs {name} 2 --status', + # Start a small job to make sure the controller is created. + f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', + # Wait other tests to create the job controller first, so that + # the job controller is not launched with proxy command. + _get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard='sky-jobs-controller-*', + cluster_status=ClusterStatus.UP.value, + timeout=300), + f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. + format( + job_name=name, + job_status= + f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', + timeout=300), + ], + f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_region_and_service_account(): + name = _get_cluster_name() + test = Test( + 'gcp_region', + [ + f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', + f'sky exec {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', + f'sky logs {name} 4 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.ibm +def test_ibm_region(): + name = _get_cluster_name() + region = 'eu-de' + test = Test( + 'region', + [ + f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', + f'sky exec {name} --cloud ibm examples/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep {region}', # Ensure the region is correct. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_region(): + name = _get_cluster_name() + test = Test( + 'azure_region', + [ + f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', + f'sky exec {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. 
+ # A user program should not access SkyPilot runtime env python by default. + f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', + f'sky logs {name} 4 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Test zone ---------- +@pytest.mark.aws +def test_aws_zone(): + name = _get_cluster_name() + test = Test( + 'aws_zone', + [ + f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', + f'sky exec {name} examples/minimal.yaml --zone us-east-2b', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-east-2b', # Ensure the zone is correct. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.ibm +def test_ibm_zone(): + name = _get_cluster_name() + zone = 'eu-de-2' + test = Test( + 'zone', + [ + f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', + f'sky exec {name} --cloud ibm examples/minimal.yaml --zone {zone}', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep {zone}', # Ensure the zone is correct. + ], + f'sky down -y {name} {name}-2 {name}-3', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_zone(): + name = _get_cluster_name() + test = Test( + 'gcp_zone', + [ + f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', + f'sky exec {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status --all | grep {name} | grep us-central1-a', # Ensure the zone is correct. 
+ ], + f'sky down -y {name}', + ) + run_one_test(test) diff --git a/tests/test_smoke.py b/tests/smoke_tests/test_smoke.py similarity index 83% rename from tests/test_smoke.py rename to tests/smoke_tests/test_smoke.py index a629816cb22..03132743c0e 100644 --- a/tests/test_smoke.py +++ b/tests/smoke_tests/test_smoke.py @@ -44,6 +44,27 @@ import colorama import jinja2 import pytest +from smoke_tests.util import _get_cluster_name +from smoke_tests.util import ( + _get_cmd_wait_until_cluster_status_contains_wildcard) +from smoke_tests.util import _GET_JOB_QUEUE +from smoke_tests.util import _get_timeout +from smoke_tests.util import _JOB_WAIT_NOT_RUNNING +from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID +from smoke_tests.util import ( + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) +from smoke_tests.util import ( + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test import sky from sky import global_user_state @@ -68,1020 +89,6 @@ from sky.utils import resources_utils from sky.utils import subprocess_utils -# To avoid the second smoke test reusing the cluster launched in the first -# smoke test. Also required for test_managed_jobs_recovery to make sure the -# manual termination with aws ec2 does not accidentally terminate other clusters -# for for the different managed jobs launch with the same job name but a -# different job id. 
-test_id = str(uuid.uuid4())[-2:] - -LAMBDA_TYPE = '--cloud lambda --gpus A10' -FLUIDSTACK_TYPE = '--cloud fluidstack --gpus RTXA4000' - -SCP_TYPE = '--cloud scp' -SCP_GPU_V100 = '--gpus V100-32GB' - -STORAGE_SETUP_COMMANDS = [ - 'touch ~/tmpfile', 'mkdir -p ~/tmp-workdir', - 'touch ~/tmp-workdir/tmp\ file', 'touch ~/tmp-workdir/tmp\ file2', - 'touch ~/tmp-workdir/foo', - '[ ! -e ~/tmp-workdir/circle-link ] && ln -s ~/tmp-workdir/ ~/tmp-workdir/circle-link || true', - 'touch ~/.ssh/id_rsa.pub' -] - -# Get the job queue, and print it once on its own, then print it again to -# use with grep by the caller. -_GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' -# Wait for a job to be not in RUNNING state. Used to check for RECOVERING. -_JOB_WAIT_NOT_RUNNING = ( - 's=$(sky jobs queue);' - 'until ! echo "$s" | grep "{job_name}" | grep "RUNNING"; do ' - 'sleep 10; s=$(sky jobs queue);' - 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') - -# Cluster functions -_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) -_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) -_ALL_MANAGED_JOB_STATUSES = "|".join( - [status.value for status in ManagedJobStatus]) - -_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( - # A while loop to wait until the cluster status - # becomes certain status, with timeout. 
- 'start_time=$SECONDS; ' - 'while true; do ' - 'if (( $SECONDS - $start_time > {timeout} )); then ' - ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' - 'fi; ' - 'current_status=$(sky status {cluster_name} --refresh | ' - 'awk "/^{cluster_name}/ ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + - ')$/) print \$i}}"); ' - 'if [[ "$current_status" =~ {cluster_status} ]]; ' - 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' - 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' - 'sleep 10; ' - 'done') - - -def _get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard: str, cluster_status: str, timeout: int): - wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( - 'sky status {cluster_name}', - 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', - 'awk "/^{cluster_name_awk}/') - return wait_cmd.format(cluster_name=cluster_name_wildcard, - cluster_name_awk=cluster_name_wildcard.replace( - '*', '.*'), - cluster_status=cluster_status, - timeout=timeout) - - -_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( - # A while loop to wait until the cluster is not found or timeout - 'start_time=$SECONDS; ' - 'while true; do ' - 'if (( $SECONDS - $start_time > {timeout} )); then ' - ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' - 'fi; ' - 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' - ' echo "Cluster {cluster_name} successfully removed."; break; ' - 'fi; ' - 'echo "Waiting for cluster {name} to be removed..."; ' - 'sleep 10; ' - 'done') - -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( - # A while loop to wait until the job status - # contains certain status, with timeout. 
- 'start_time=$SECONDS; ' - 'while true; do ' - 'if (( $SECONDS - $start_time > {timeout} )); then ' - ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' - 'fi; ' - 'current_status=$(sky queue {cluster_name} | ' - 'awk "\\$1 == \\"{job_id}\\" ' - '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + - ')$/) print \$i}}"); ' - 'found=0; ' # Initialize found variable outside the loop - 'while read -r line; do ' # Read line by line - ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line - ' echo "Target job status {job_status} reached."; ' - ' found=1; ' - ' break; ' # Break inner loop - ' fi; ' - 'done <<< "$current_status"; ' - 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found - 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' - 'sleep 10; ' - 'done') - -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_id}\\"', 'awk "') - -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( - 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') - -# Managed job functions - -_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( - 'sky queue {cluster_name}', 'sky jobs queue').replace( - 'awk "\\$2 == \\"{job_name}\\"', - 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( - _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) - -# After the timeout, the cluster will stop if autostop is set, and our check -# should be more than the timeout. To address this, we extend the timeout by -# _BUMP_UP_SECONDS before exiting. -_BUMP_UP_SECONDS = 35 - -DEFAULT_CMD_TIMEOUT = 15 * 60 - - -class Test(NamedTuple): - name: str - # Each command is executed serially. 
If any failed, the remaining commands - # are not run and the test is treated as failed. - commands: List[str] - teardown: Optional[str] = None - # Timeout for each command in seconds. - timeout: int = DEFAULT_CMD_TIMEOUT - # Environment variables to set for each command. - env: Dict[str, str] = None - - def echo(self, message: str): - # pytest's xdist plugin captures stdout; print to stderr so that the - # logs are streaming while the tests are running. - prefix = f'[{self.name}]' - message = f'{prefix} {message}' - message = message.replace('\n', f'\n{prefix} ') - print(message, file=sys.stderr, flush=True) - - -def _get_timeout(generic_cloud: str, - override_timeout: int = DEFAULT_CMD_TIMEOUT): - timeouts = {'fluidstack': 60 * 60} # file_mounts - return timeouts.get(generic_cloud, override_timeout) - - -def _get_cluster_name() -> str: - """Returns a user-unique cluster name for each test_(). - - Must be called from each test_(). - """ - caller_func_name = inspect.stack()[1][3] - test_name = caller_func_name.replace('_', '-').replace('test-', 't-') - test_name = common_utils.make_cluster_name_on_cloud(test_name, - 24, - add_user_hash=False) - return f'{test_name}-{test_id}' - - -def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: - cluster_name = serve.generate_replica_cluster_name(name, replica_id) - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{cluster_name})" ' - f'--zones={zone} --format="value(name)"') - return (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - - -def run_one_test(test: Test) -> Tuple[int, str, str]: - # Fail fast if `sky` CLI somehow errors out. - subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True) - log_file = tempfile.NamedTemporaryFile('a', - prefix=f'{test.name}-', - suffix='.log', - delete=False) - test.echo(f'Test started. 
Log: less {log_file.name}') - env_dict = os.environ.copy() - if test.env: - env_dict.update(test.env) - for command in test.commands: - log_file.write(f'+ {command}\n') - log_file.flush() - proc = subprocess.Popen( - command, - stdout=log_file, - stderr=subprocess.STDOUT, - shell=True, - executable='/bin/bash', - env=env_dict, - ) - try: - proc.wait(timeout=test.timeout) - except subprocess.TimeoutExpired as e: - log_file.flush() - test.echo(f'Timeout after {test.timeout} seconds.') - test.echo(str(e)) - log_file.write(f'Timeout after {test.timeout} seconds.\n') - log_file.flush() - # Kill the current process. - proc.terminate() - proc.returncode = 1 # None if we don't set it. - break - - if proc.returncode: - break - - style = colorama.Style - fore = colorama.Fore - outcome = (f'{fore.RED}Failed{style.RESET_ALL}' - if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}') - reason = f'\nReason: {command}' if proc.returncode else '' - msg = (f'{outcome}.' - f'{reason}' - f'\nLog: less {log_file.name}\n') - test.echo(msg) - log_file.write(msg) - if (proc.returncode == 0 or - pytest.terminate_on_failure) and test.teardown is not None: - subprocess_utils.run( - test.teardown, - stdout=log_file, - stderr=subprocess.STDOUT, - timeout=10 * 60, # 10 mins - shell=True, - ) - - if proc.returncode: - raise Exception(f'test failed: less {log_file.name}') - - -def get_aws_region_for_quota_failover() -> Optional[str]: - candidate_regions = AWS.regions_with_offering(instance_type='p3.16xlarge', - accelerators=None, - use_spot=True, - region=None, - zone=None) - original_resources = sky.Resources(cloud=sky.AWS(), - instance_type='p3.16xlarge', - use_spot=True) - - # Filter the regions with proxy command in ~/.sky/config.yaml. 
- filtered_regions = original_resources.get_valid_regions_for_launchable() - candidate_regions = [ - region for region in candidate_regions - if region.name in filtered_regions - ] - - for region in candidate_regions: - resources = original_resources.copy(region=region.name) - if not AWS.check_quota_available(resources): - return region.name - - return None - - -def get_gcp_region_for_quota_failover() -> Optional[str]: - - candidate_regions = GCP.regions_with_offering(instance_type=None, - accelerators={'A100-80GB': 1}, - use_spot=True, - region=None, - zone=None) - - original_resources = sky.Resources(cloud=sky.GCP(), - instance_type='a2-ultragpu-1g', - accelerators={'A100-80GB': 1}, - use_spot=True) - - # Filter the regions with proxy command in ~/.sky/config.yaml. - filtered_regions = original_resources.get_valid_regions_for_launchable() - candidate_regions = [ - region for region in candidate_regions - if region.name in filtered_regions - ] - - for region in candidate_regions: - if not GCP.check_quota_available( - original_resources.copy(region=region.name)): - return region.name - - return None - - -# ---------- Dry run: 2 Tasks in a chain. ---------- -@pytest.mark.no_fluidstack #requires GCP and AWS set up -def test_example_app(): - test = Test( - 'example_app', - ['python examples/example_app.py'], - ) - run_one_test(test) - - -_VALIDATE_LAUNCH_OUTPUT = ( - # Validate the output of the job submission: - # ⚙️ Launching on Kubernetes. - # Pod is up. - # ✓ Cluster launched: test. View logs at: ~/sky_logs/sky-2024-10-07-19-44-18-177288/provision.log - # ⚙️ Running setup on 1 pod. - # running setup - # ✓ Setup completed. - # ⚙️ Job submitted, ID: 1. - # ├── Waiting for task resources on 1 node. - # └── Job started. Streaming logs... 
(Ctrl-C to exit log streaming; job will not be killed) - # (min, pid=1277) # conda environments: - # (min, pid=1277) # - # (min, pid=1277) base * /opt/conda - # (min, pid=1277) - # (min, pid=1277) task run finish - # ✓ Job finished (status: SUCCEEDED). - # - # Job ID: 1 - # 📋 Useful Commands - # ├── To cancel the job: sky cancel test 1 - # ├── To stream job logs: sky logs test 1 - # └── To view job queue: sky queue test - # - # Cluster name: test - # ├── To log into the head VM: ssh test - # ├── To submit a job: sky exec test yaml_file - # ├── To stop the cluster: sky stop test - # └── To teardown the cluster: sky down test - 'echo "$s" && echo "==Validating launching==" && ' - 'echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' - 'echo "$s" && echo "==Validating setup output==" && ' - 'echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' - 'echo "==Validating running output hints==" && echo "$s" | ' - 'grep -A 1 "Job submitted, ID:" | ' - 'grep "Waiting for task resources on " && ' - 'echo "==Validating task output starting==" && echo "$s" | ' - 'grep -A 1 "Job started. Streaming logs..." | grep "(min, pid=" && ' - 'echo "==Validating task output ending==" && ' - 'echo "$s" | grep -A 1 "task run finish" | ' - 'grep "Job finished (status: SUCCEEDED)" && ' - 'echo "==Validating task output ending 2==" && ' - 'echo "$s" | grep -A 5 "Job finished (status: SUCCEEDED)" | ' - 'grep "Job ID:" && ' - 'echo "$s" | grep -A 1 "Job ID:" | grep "Useful Commands"') - - -# ---------- A minimal task ---------- -def test_minimal(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'minimal', - [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - # Output validation done. - f'sky logs {name} 1 --status', - f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. 
- # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check the logs downloading - f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log', - # Ensure the raylet process has the correct file descriptor limit. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # Install jq for the next test. - f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', - # Check the cluster info - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', - f'sky logs {name} 5 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', - f'sky logs {name} 6 --status', # Ensure the job succeeded. 
- # Test '-c' for exec - f'sky exec -c {name} echo', - f'sky logs {name} 7 --status', - f'sky exec echo -c {name}', - f'sky logs {name} 8 --status', - f'sky exec -c {name} echo hi test', - f'sky logs {name} 9 | grep "hi test"', - f'sky exec {name} && exit 1 || true', - f'sky exec -c {name} && exit 1 || true', - ], - f'sky down -y {name}', - _get_timeout(generic_cloud), - ) - run_one_test(test) - - -# ---------- Test fast launch ---------- -def test_launch_fast(generic_cloud: str): - name = _get_cluster_name() - - test = Test( - 'test_launch_fast', - [ - # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - - # Second launch to test fast launch - should not reprovision - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast tests/test_yamls/minimal.yaml) && ' - ' echo "$s" && ' - # Validate that cluster was not re-launched. - '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' - # Validate that setup was not re-run. - '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' - # Validate that the task ran and finished. - 'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=_get_timeout(generic_cloud), - ) - run_one_test(test) - - -# See cloud exclusion explanations in test_autostop -@pytest.mark.no_fluidstack -@pytest.mark.no_lambda_cloud -@pytest.mark.no_ibm -@pytest.mark.no_kubernetes -def test_launch_fast_with_autostop(generic_cloud: str): - name = _get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. 
- autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = Test( - 'test_launch_fast_with_autostop', - [ - # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - f'sky status -r {name} | grep UP', - - # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), - - # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=_get_timeout(generic_cloud) + autostop_timeout, - ) - run_one_test(test) - - -# ---------- Test region ---------- -@pytest.mark.aws -def test_aws_region(): - name = _get_cluster_name() - test = Test( - 'aws_region', - [ - f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', - f'sky exec {name} examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_with_ssh_proxy_command(): - name = _get_cluster_name() - - with tempfile.NamedTemporaryFile(mode='w') as f: - f.write( - textwrap.dedent(f"""\ - aws: - ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} - """)) - f.flush() - test = Test( - 'aws_with_ssh_proxy_command', - [ - f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', - # Use jump config - f'export SKYPILOT_CONFIG={f.name}; ' - f'sky launch -y -c {name} --cloud aws --cpus 2 --region us-east-1 echo hi', - f'sky logs {name} 1 --status', - f'export SKYPILOT_CONFIG={f.name}; sky exec {name} echo hi', - f'sky logs {name} 2 --status', - # Start a small job to make sure the controller is created. - f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', - # Wait other tests to create the job controller first, so that - # the job controller is not launched with proxy command. - _get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard='sky-jobs-controller-*', - cluster_status=ClusterStatus.UP.value, - timeout=300), - f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. - format( - job_name=name, - job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', - timeout=300), - ], - f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_region_and_service_account(): - name = _get_cluster_name() - test = Test( - 'gcp_region', - [ - f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_region(): - name = _get_cluster_name() - region = 'eu-de' - test = Test( - 'region', - [ - f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', - f'sky exec {name} --cloud ibm examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep {region}', # Ensure the region is correct. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_region(): - name = _get_cluster_name() - test = Test( - 'azure_region', - [ - f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. 
- # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Test zone ---------- -@pytest.mark.aws -def test_aws_zone(): - name = _get_cluster_name() - test = Test( - 'aws_zone', - [ - f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', - f'sky exec {name} examples/minimal.yaml --zone us-east-2b', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2b', # Ensure the zone is correct. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_zone(): - name = _get_cluster_name() - zone = 'eu-de-2' - test = Test( - 'zone', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky exec {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep {zone}', # Ensure the zone is correct. - ], - f'sky down -y {name} {name}-2 {name}-3', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zone(): - name = _get_cluster_name() - test = Test( - 'gcp_zone', - [ - f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1-a', # Ensure the zone is correct. 
- ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Test the image ---------- -@pytest.mark.aws -def test_aws_images(): - name = _get_cluster_name() - test = Test( - 'aws_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} examples/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_images(): - name = _get_cluster_name() - test = Test( - 'gcp_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:cpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_images(): - name = _get_cluster_name() - test = Test( - 'azure_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky launch -c {name} --image-id skypilot:v1-ubuntu-2004 --cloud azure tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict(): - name = _get_cluster_name() - test = Test( - 'aws_image_id_dict', - [ - # Use image id dict. - f'sky launch -y -c {name} examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict(): - name = _get_cluster_name() - test = Test( - 'gcp_image_id_dict', - [ - # Use image id dict. - f'sky launch -y -c {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_region(): - name = _get_cluster_name() - test = Test( - 'aws_image_id_dict_region', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east-1 examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. 
- f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - # Should success because the image id match for the region. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - # Ensure exec works. - f'sky exec {name} --region us-east-2 examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_region(): - name = _get_cluster_name() - test = Test( - 'gcp_image_id_dict_region', - [ - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east1 tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the region. 
- f'sky launch -c {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-west3', # Ensure the region is correct. - # Ensure exec works. - f'sky exec {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-west3 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_zone(): - name = _get_cluster_name() - test = Test( - 'aws_image_id_dict_zone', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east-1b examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-east-2a examples/per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - # Fail due to image id mismatch. 
- f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2a', # Ensure the zone is correct. - # Ensure exec works. - f'sky exec {name} --zone us-east-2a examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_zone(): - name = _get_cluster_name() - test = Test( - 'gcp_image_id_dict_zone', - [ - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east1-a tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - # Fail due to image id mismatch. - f'sky exec {name} --cloud gcp --image-id skypilot:gpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-central1', # Ensure the zone is correct. - # Ensure exec works. 
- f'sky exec {name} --cloud gcp --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-central1 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_clone_disk_aws(): - name = _get_cluster_name() - test = Test( - 'clone_disk_aws', - [ - f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=60), - # Wait for EC2 instance to be in stopped state. - # TODO: event based wait. - 'sleep 60', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_clone_disk_gcp(): - name = _get_cluster_name() - test = Test( - 'clone_disk_gcp', - [ - f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud gcp --zone us-central1-a "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud gcp --zone us-east1-b "cat 
~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_mig(): - name = _get_cluster_name() - region = 'us-central1' - test = Test( - 'gcp_mig', - [ - f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check MIG exists. - f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', - f'sky autostop -i 0 --down -y {name}', - _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, - timeout=120), - f'gcloud compute instance-templates list | grep "sky-it-{name}"', - # Launch again with the same region. The original instance template - # should be removed. 
- f'sky launch -y -c {name} --gpus L4 --num-nodes 2 --region {region} nvidia-smi', - f'sky logs {name} 1 | grep "L4"', - f'sky down -y {name}', - f'gcloud compute instance-templates list | grep "sky-it-{name}" && exit 1 || true', - ], - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_force_enable_external_ips(): - name = _get_cluster_name() - test_commands = [ - f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', - # Check network of vm is "default" - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.network)" | grep "networks/default"'), - # Check External NAT in network access configs, corresponds to external ip - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.accessConfigs[0].name)" | grep "External NAT"' - ), - f'sky down -y {name}', - ] - skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' - test = Test('gcp_force_enable_external_ips', - test_commands, - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': skypilot_config}) - run_one_test(test) - - -@pytest.mark.aws -def test_image_no_conda(): - name = _get_cluster_name() - test = Test( - 'image_no_conda', - [ - # Use image id dict. 
- f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - f'sky logs {name} 1 --status', - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} examples/per_region_images.yaml', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_custom_default_conda_env(generic_cloud: str): - name = _get_cluster_name() - test = Test('custom_default_conda_env', [ - f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky status -r {name} | grep "UP"', - f'sky logs {name} 1 --status', - f'sky logs {name} 1 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 2 --status', - f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=80), - f'sky start -y {name}', - f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 3 --status', - ], f'sky down -y {name}') - run_one_test(test) - # ------------ Test stale job ------------ @pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py new file mode 100644 index 00000000000..ebd71e9a10e --- /dev/null +++ b/tests/smoke_tests/util.py @@ -0,0 +1,381 @@ +import enum +import inspect +import json +import os +import pathlib +import shlex +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from typing import Dict, List, NamedTuple, Optional, Tuple +import urllib.parse +import uuid + +import colorama +import 
jinja2 +import pytest + +import sky +from sky import global_user_state +from sky import jobs +from sky import serve +from sky import skypilot_config +from sky.adaptors import azure +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.clouds import AWS +from sky.clouds import Azure +from sky.clouds import GCP +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils +from sky.utils import subprocess_utils + +# To avoid the second smoke test reusing the cluster launched in the first +# smoke test. Also required for test_managed_jobs_recovery to make sure the +# manual termination with aws ec2 does not accidentally terminate other clusters +# for for the different managed jobs launch with the same job name but a +# different job id. +test_id = str(uuid.uuid4())[-2:] + +LAMBDA_TYPE = '--cloud lambda --gpus A10' +FLUIDSTACK_TYPE = '--cloud fluidstack --gpus RTXA4000' + +SCP_TYPE = '--cloud scp' +SCP_GPU_V100 = '--gpus V100-32GB' + +STORAGE_SETUP_COMMANDS = [ + 'touch ~/tmpfile', 'mkdir -p ~/tmp-workdir', + 'touch ~/tmp-workdir/tmp\ file', 'touch ~/tmp-workdir/tmp\ file2', + 'touch ~/tmp-workdir/foo', + '[ ! -e ~/tmp-workdir/circle-link ] && ln -s ~/tmp-workdir/ ~/tmp-workdir/circle-link || true', + 'touch ~/.ssh/id_rsa.pub' +] + +# Get the job queue, and print it once on its own, then print it again to +# use with grep by the caller. +_GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' +# Wait for a job to be not in RUNNING state. Used to check for RECOVERING. +_JOB_WAIT_NOT_RUNNING = ( + 's=$(sky jobs queue);' + 'until ! 
echo "$s" | grep "{job_name}" | grep "RUNNING"; do ' + 'sleep 10; s=$(sky jobs queue);' + 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') + +# Cluster functions +_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) +_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) +_ALL_MANAGED_JOB_STATUSES = "|".join( + [status.value for status in ManagedJobStatus]) + +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( + # A while loop to wait until the cluster status + # becomes certain status, with timeout. + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky status {cluster_name} --refresh | ' + 'awk "/^{cluster_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + + ')$/) print \$i}}"); ' + 'if [[ "$current_status" =~ {cluster_status} ]]; ' + 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' + 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' + 'sleep 10; ' + 'done') + + +def _get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard: str, cluster_status: str, timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + 'sky status {cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + return wait_cmd.format(cluster_name=cluster_name_wildcard, + cluster_name_awk=cluster_name_wildcard.replace( + '*', '.*'), + cluster_status=cluster_status, + timeout=timeout) + + +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( + # A while loop to wait until the cluster is not found or timeout + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' + 'fi; ' + 'if 
sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' + ' echo "Cluster {cluster_name} successfully removed."; break; ' + 'fi; ' + 'echo "Waiting for cluster {cluster_name} to be removed..."; ' + 'sleep 10; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( + # A while loop to wait until the job status + # contains certain status, with timeout. + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky queue {cluster_name} | ' + 'awk "\\$1 == \\"{job_id}\\" ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + + ')$/) print \$i}}"); ' + 'found=0; ' # Initialize found variable outside the loop + 'while read -r line; do ' # Read line by line + ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line + ' echo "Target job status {job_status} reached."; ' + ' found=1; ' + ' break; ' # Break inner loop + ' fi; ' + 'done <<< "$current_status"; ' + 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found + 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' + 'sleep 10; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_id}\\"', 'awk "') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') + +# Managed job functions + +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( + 'sky queue {cluster_name}', 'sky jobs queue').replace( + 'awk "\\$2 == \\"{job_name}\\"', + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( + _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) + +# After the 
timeout, the cluster will stop if autostop is set, and our check +# should be more than the timeout. To address this, we extend the timeout by +# _BUMP_UP_SECONDS before exiting. +_BUMP_UP_SECONDS = 35 + +DEFAULT_CMD_TIMEOUT = 15 * 60 + + +class Test(NamedTuple): + name: str + # Each command is executed serially. If any failed, the remaining commands + # are not run and the test is treated as failed. + commands: List[str] + teardown: Optional[str] = None + # Timeout for each command in seconds. + timeout: int = DEFAULT_CMD_TIMEOUT + # Environment variables to set for each command. + env: Dict[str, str] = None + + def echo(self, message: str): + # pytest's xdist plugin captures stdout; print to stderr so that the + # logs are streaming while the tests are running. + prefix = f'[{self.name}]' + message = f'{prefix} {message}' + message = message.replace('\n', f'\n{prefix} ') + print(message, file=sys.stderr, flush=True) + + +def _get_timeout(generic_cloud: str, + override_timeout: int = DEFAULT_CMD_TIMEOUT): + timeouts = {'fluidstack': 60 * 60} # file_mounts + return timeouts.get(generic_cloud, override_timeout) + + +def _get_cluster_name() -> str: + """Returns a user-unique cluster name for each test_(). + + Must be called from each test_(). 
+ """ + caller_func_name = inspect.stack()[1][3] + test_name = caller_func_name.replace('_', '-').replace('test-', 't-') + test_name = common_utils.make_cluster_name_on_cloud(test_name, + 24, + add_user_hash=False) + return f'{test_name}-{test_id}' + + +def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: + cluster_name = serve.generate_replica_cluster_name(name, replica_id) + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{cluster_name})" ' + f'--zones={zone} --format="value(name)"') + return (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + + +def run_one_test(test: Test) -> Tuple[int, str, str]: + # Fail fast if `sky` CLI somehow errors out. + subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True) + log_file = tempfile.NamedTemporaryFile('a', + prefix=f'{test.name}-', + suffix='.log', + delete=False) + test.echo(f'Test started. Log: less {log_file.name}') + env_dict = os.environ.copy() + if test.env: + env_dict.update(test.env) + for command in test.commands: + log_file.write(f'+ {command}\n') + log_file.flush() + proc = subprocess.Popen( + command, + stdout=log_file, + stderr=subprocess.STDOUT, + shell=True, + executable='/bin/bash', + env=env_dict, + ) + try: + proc.wait(timeout=test.timeout) + except subprocess.TimeoutExpired as e: + log_file.flush() + test.echo(f'Timeout after {test.timeout} seconds.') + test.echo(str(e)) + log_file.write(f'Timeout after {test.timeout} seconds.\n') + log_file.flush() + # Kill the current process. + proc.terminate() + proc.returncode = 1 # None if we don't set it. + break + + if proc.returncode: + break + + style = colorama.Style + fore = colorama.Fore + outcome = (f'{fore.RED}Failed{style.RESET_ALL}' + if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}') + reason = f'\nReason: {command}' if proc.returncode else '' + msg = (f'{outcome}.' 
+ f'{reason}' + f'\nLog: less {log_file.name}\n') + test.echo(msg) + log_file.write(msg) + if (proc.returncode == 0 or + pytest.terminate_on_failure) and test.teardown is not None: + subprocess_utils.run( + test.teardown, + stdout=log_file, + stderr=subprocess.STDOUT, + timeout=10 * 60, # 10 mins + shell=True, + ) + + if proc.returncode: + raise Exception(f'test failed: less {log_file.name}') + + +def get_aws_region_for_quota_failover() -> Optional[str]: + candidate_regions = AWS.regions_with_offering(instance_type='p3.16xlarge', + accelerators=None, + use_spot=True, + region=None, + zone=None) + original_resources = sky.Resources(cloud=sky.AWS(), + instance_type='p3.16xlarge', + use_spot=True) + + # Filter the regions with proxy command in ~/.sky/config.yaml. + filtered_regions = original_resources.get_valid_regions_for_launchable() + candidate_regions = [ + region for region in candidate_regions + if region.name in filtered_regions + ] + + for region in candidate_regions: + resources = original_resources.copy(region=region.name) + if not AWS.check_quota_available(resources): + return region.name + + return None + + +def get_gcp_region_for_quota_failover() -> Optional[str]: + + candidate_regions = GCP.regions_with_offering(instance_type=None, + accelerators={'A100-80GB': 1}, + use_spot=True, + region=None, + zone=None) + + original_resources = sky.Resources(cloud=sky.GCP(), + instance_type='a2-ultragpu-1g', + accelerators={'A100-80GB': 1}, + use_spot=True) + + # Filter the regions with proxy command in ~/.sky/config.yaml. + filtered_regions = original_resources.get_valid_regions_for_launchable() + candidate_regions = [ + region for region in candidate_regions + if region.name in filtered_regions + ] + + for region in candidate_regions: + if not GCP.check_quota_available( + original_resources.copy(region=region.name)): + return region.name + + return None + + +_VALIDATE_LAUNCH_OUTPUT = ( + # Validate the output of the job submission: + # ⚙️ Launching on Kubernetes. 
+ # Pod is up. + # ✓ Cluster launched: test. View logs at: ~/sky_logs/sky-2024-10-07-19-44-18-177288/provision.log + # ⚙️ Running setup on 1 pod. + # running setup + # ✓ Setup completed. + # ⚙️ Job submitted, ID: 1. + # ├── Waiting for task resources on 1 node. + # └── Job started. Streaming logs... (Ctrl-C to exit log streaming; job will not be killed) + # (min, pid=1277) # conda environments: + # (min, pid=1277) # + # (min, pid=1277) base * /opt/conda + # (min, pid=1277) + # (min, pid=1277) task run finish + # ✓ Job finished (status: SUCCEEDED). + # + # Job ID: 1 + # 📋 Useful Commands + # ├── To cancel the job: sky cancel test 1 + # ├── To stream job logs: sky logs test 1 + # └── To view job queue: sky queue test + # + # Cluster name: test + # ├── To log into the head VM: ssh test + # ├── To submit a job: sky exec test yaml_file + # ├── To stop the cluster: sky stop test + # └── To teardown the cluster: sky down test + 'echo "$s" && echo "==Validating launching==" && ' + 'echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' + 'echo "$s" && echo "==Validating setup output==" && ' + 'echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' + 'echo "==Validating running output hints==" && echo "$s" | ' + 'grep -A 1 "Job submitted, ID:" | ' + 'grep "Waiting for task resources on " && ' + 'echo "==Validating task output starting==" && echo "$s" | ' + 'grep -A 1 "Job started. Streaming logs..." 
| grep "(min, pid=" && ' + 'echo "==Validating task output ending==" && ' + 'echo "$s" | grep -A 1 "task run finish" | ' + 'grep "Job finished (status: SUCCEEDED)" && ' + 'echo "==Validating task output ending 2==" && ' + 'echo "$s" | grep -A 5 "Job finished (status: SUCCEEDED)" | ' + 'grep "Job ID:" && ' + 'echo "$s" | grep -A 1 "Job ID:" | grep "Useful Commands"') From 9abd4d43e0c2d13c99f57ea4615087d064629cfe Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 17:01:04 +0800 Subject: [PATCH 20/64] fix import --- tests/smoke_tests/test_basic.py | 54 ---------------------- tests/smoke_tests/test_images.py | 55 ----------------------- tests/smoke_tests/test_region_and_zone.py | 33 -------------- tests/smoke_tests/test_smoke.py | 14 +++--- 4 files changed, 6 insertions(+), 150 deletions(-) diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index 9d8a1225e42..c0996e135d0 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -25,69 +25,15 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -import enum -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys -import tempfile -import textwrap -import time -from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse -import uuid - -import colorama -import jinja2 import pytest from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import _GET_JOB_QUEUE from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( 
- _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE -from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils # ---------- Dry run: 2 Tasks in a chain. 
---------- diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index 42438461f76..96ce2f59c0c 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -25,69 +25,14 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -import enum -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys -import tempfile -import textwrap -import time -from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse -import uuid - -import colorama -import jinja2 import pytest from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import _GET_JOB_QUEUE -from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE -from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from 
sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils # ---------- Test the image ---------- diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index 57f84ff4a0e..0fc7ce409fc 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -47,47 +47,14 @@ from smoke_tests.util import _get_cluster_name from smoke_tests.util import ( _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import _GET_JOB_QUEUE -from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) from smoke_tests.util import ( _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE -from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP 
-from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone from sky.jobs.state import ManagedJobStatus from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils # ---------- Test region ---------- diff --git a/tests/smoke_tests/test_smoke.py b/tests/smoke_tests/test_smoke.py index 03132743c0e..348c880d7a7 100644 --- a/tests/smoke_tests/test_smoke.py +++ b/tests/smoke_tests/test_smoke.py @@ -25,7 +25,6 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -import enum import inspect import json import os @@ -37,34 +36,33 @@ import tempfile import textwrap import time -from typing import Dict, List, NamedTuple, Optional, Tuple +from typing import Dict, List, Optional, Tuple import urllib.parse import uuid -import colorama import jinja2 import pytest +from smoke_tests.util import _BUMP_UP_SECONDS from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) from smoke_tests.util import _GET_JOB_QUEUE from smoke_tests.util import _get_timeout from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND +from smoke_tests.util import _terminate_gcp_replica from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID from smoke_tests.util import ( _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) from smoke_tests.util import ( _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import FLUIDSTACK_TYPE +from smoke_tests.util import get_aws_region_for_quota_failover +from smoke_tests.util 
import get_gcp_region_for_quota_failover from smoke_tests.util import LAMBDA_TYPE from smoke_tests.util import run_one_test from smoke_tests.util import SCP_GPU_V100 from smoke_tests.util import SCP_TYPE from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test +from smoke_tests.util import test_id import sky from sky import global_user_state From e0a4c9fdf98a711be66f7921778e74f41af127fe Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 17:56:44 +0800 Subject: [PATCH 21/64] buildkite config --- .buildkite/pipeline.yaml | 5 ++++ tests/smoke_tests/util.py | 52 ++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 17 deletions(-) create mode 100644 .buildkite/pipeline.yaml diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml new file mode 100644 index 00000000000..21efc41de1d --- /dev/null +++ b/.buildkite/pipeline.yaml @@ -0,0 +1,5 @@ +steps: + - label: "smoke test -> test_minimal" + command: "pytest tests/smoke_tests/test_basic.py::test_minimal" + env: + LOG_TO_STDOUT: "1" diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index ebd71e9a10e..c413bc6f2be 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -222,20 +222,31 @@ def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: def run_one_test(test: Test) -> Tuple[int, str, str]: # Fail fast if `sky` CLI somehow errors out. subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True) - log_file = tempfile.NamedTemporaryFile('a', - prefix=f'{test.name}-', - suffix='.log', - delete=False) - test.echo(f'Test started. Log: less {log_file.name}') + log_to_stdout = os.environ.get('LOG_TO_STDOUT', None) + if log_to_stdout: + write = test.echo + flush = lambda: None + out = sys.stdout + test.echo(f'Test started. 
Log to stdout') + else: + log_file = tempfile.NamedTemporaryFile('a', + prefix=f'{test.name}-', + suffix='.log', + delete=False) + write = log_file.write + flush = log_file.flush + out = log_file + test.echo(f'Test started. Log: less {log_file.name}') + env_dict = os.environ.copy() if test.env: env_dict.update(test.env) for command in test.commands: - log_file.write(f'+ {command}\n') - log_file.flush() + write(f'+ {command}\n') + flush() proc = subprocess.Popen( command, - stdout=log_file, + stdout=out, stderr=subprocess.STDOUT, shell=True, executable='/bin/bash', @@ -244,11 +255,11 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: try: proc.wait(timeout=test.timeout) except subprocess.TimeoutExpired as e: - log_file.flush() + flush() test.echo(f'Timeout after {test.timeout} seconds.') test.echo(str(e)) - log_file.write(f'Timeout after {test.timeout} seconds.\n') - log_file.flush() + write(f'Timeout after {test.timeout} seconds.\n') + flush() # Kill the current process. proc.terminate() proc.returncode = 1 # None if we don't set it. @@ -263,22 +274,29 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}') reason = f'\nReason: {command}' if proc.returncode else '' msg = (f'{outcome}.' 
- f'{reason}' - f'\nLog: less {log_file.name}\n') - test.echo(msg) - log_file.write(msg) + f'{reason}') + if log_to_stdout: + test.echo(msg) + else: + msg += f'\nLog: less {log_file.name}\n' + test.echo(msg) + write(msg) + if (proc.returncode == 0 or pytest.terminate_on_failure) and test.teardown is not None: subprocess_utils.run( test.teardown, - stdout=log_file, + stdout=out, stderr=subprocess.STDOUT, timeout=10 * 60, # 10 mins shell=True, ) if proc.returncode: - raise Exception(f'test failed: less {log_file.name}') + if log_to_stdout: + raise Exception(f'test failed') + else: + raise Exception(f'test failed: less {log_file.name}') def get_aws_region_for_quota_failover() -> Optional[str]: From 58090a340584ee810ded86b7db6a1c2ab22e873f Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 21 Nov 2024 18:09:08 +0800 Subject: [PATCH 22/64] fix stdout problem --- tests/smoke_tests/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index c413bc6f2be..322c19a266e 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -226,7 +226,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: if log_to_stdout: write = test.echo flush = lambda: None - out = sys.stdout + subprocess_out = sys.stderr test.echo(f'Test started. Log to stdout') else: log_file = tempfile.NamedTemporaryFile('a', @@ -235,7 +235,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: delete=False) write = log_file.write flush = log_file.flush - out = log_file + subprocess_out = log_file test.echo(f'Test started. 
Log: less {log_file.name}') env_dict = os.environ.copy() @@ -246,7 +246,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: flush() proc = subprocess.Popen( command, - stdout=out, + stdout=subprocess_out, stderr=subprocess.STDOUT, shell=True, executable='/bin/bash', @@ -286,7 +286,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: pytest.terminate_on_failure) and test.teardown is not None: subprocess_utils.run( test.teardown, - stdout=out, + stdout=subprocess_out, stderr=subprocess.STDOUT, timeout=10 * 60, # 10 mins shell=True, From 88b396f11793676204f679d19164cadc9b105fac Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 11:02:29 +0800 Subject: [PATCH 23/64] update pipeline test --- .buildkite/pipeline.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 21efc41de1d..679b463580e 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -1,5 +1,10 @@ steps: - - label: "smoke test -> test_minimal" + - label: "test_minimal" command: "pytest tests/smoke_tests/test_basic.py::test_minimal" env: LOG_TO_STDOUT: "1" + + - label: "test_aws_stale_job_manual_restart" + command: "pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart" + env: + LOG_TO_STDOUT: "1" From 9405b449fc2d13e45d20eacd3a97d7c5eca408b7 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 11:31:01 +0800 Subject: [PATCH 24/64] test again --- .buildkite/pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 679b463580e..4d8ed6ff8f0 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -4,7 +4,7 @@ steps: env: LOG_TO_STDOUT: "1" - - label: "test_aws_stale_job_manual_restart" - command: "pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart" + - label: "test_launch_fast" + command: "pytest 
tests/smoke_tests/test_basic.py::test_launch_fast" env: LOG_TO_STDOUT: "1" From 5a2409f2e2cdc4f588d5aa3a482f65a936e05c50 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 15:34:26 +0800 Subject: [PATCH 25/64] smoke test for buildkite --- .buildkite/generate_pipeline.py | 107 + .buildkite/pipeline.yaml | 10 - .buildkite/pipeline_smoke_test_basic.yaml | 91 + .../pipeline_smoke_test_cluster_job.yaml | 265 + .buildkite/pipeline_smoke_test_images.yaml | 66 + .../pipeline_smoke_test_managed_job.yaml | 79 + ...pipeline_smoke_test_mount_and_storage.yaml | 139 + .../pipeline_smoke_test_region_and_zone.yaml | 36 + ...line_smoke_test_required_before_merge.yaml | 7 + .buildkite/pipeline_smoke_test_sky_serve.yaml | 106 + tests/smoke_tests/test_basic.py | 511 +- tests/smoke_tests/test_cluster_job.py | 1657 ++++++ tests/smoke_tests/test_images.py | 50 +- tests/smoke_tests/test_managed_job.py | 766 +++ tests/smoke_tests/test_mount_and_storage.py | 1503 +++++ tests/smoke_tests/test_region_and_zone.py | 65 +- .../smoke_tests/test_required_before_merge.py | 46 + tests/smoke_tests/test_sky_serve.py | 795 +++ tests/smoke_tests/test_smoke.py | 5077 ----------------- tests/smoke_tests/util.py | 54 +- tests/test_smoke.py | 36 + .../minimal_test_required_before_merge.yaml | 13 + 22 files changed, 6254 insertions(+), 5225 deletions(-) create mode 100644 .buildkite/generate_pipeline.py delete mode 100644 .buildkite/pipeline.yaml create mode 100644 .buildkite/pipeline_smoke_test_basic.yaml create mode 100644 .buildkite/pipeline_smoke_test_cluster_job.yaml create mode 100644 .buildkite/pipeline_smoke_test_images.yaml create mode 100644 .buildkite/pipeline_smoke_test_managed_job.yaml create mode 100644 .buildkite/pipeline_smoke_test_mount_and_storage.yaml create mode 100644 .buildkite/pipeline_smoke_test_region_and_zone.yaml create mode 100644 .buildkite/pipeline_smoke_test_required_before_merge.yaml create mode 100644 .buildkite/pipeline_smoke_test_sky_serve.yaml create mode 
100644 tests/smoke_tests/test_cluster_job.py create mode 100644 tests/smoke_tests/test_managed_job.py create mode 100644 tests/smoke_tests/test_mount_and_storage.py create mode 100644 tests/smoke_tests/test_required_before_merge.py create mode 100644 tests/smoke_tests/test_sky_serve.py delete mode 100644 tests/smoke_tests/test_smoke.py create mode 100644 tests/test_smoke.py create mode 100644 tests/test_yamls/minimal_test_required_before_merge.yaml diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py new file mode 100644 index 00000000000..b363c695057 --- /dev/null +++ b/.buildkite/generate_pipeline.py @@ -0,0 +1,107 @@ +"""This script generates a Buildkite pipeline from test files.""" +import ast +import copy +import os +from typing import Any, Dict, List + +import yaml + +DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] + + +def _get_full_decorator_path(decorator: ast.AST) -> str: + """Recursively get the full path of a decorator.""" + if isinstance(decorator, ast.Attribute): + return f'{_get_full_decorator_path(decorator.value)}.{decorator.attr}' + elif isinstance(decorator, ast.Name): + return decorator.id + elif isinstance(decorator, ast.Call): + return _get_full_decorator_path(decorator.func) + raise ValueError(f'Unknown decorator type: {type(decorator)}') + + +def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: + """Extract test functions and filter clouds with pytest.mark + from a Python test file.""" + with open(file_path, 'r', encoding='utf-8') as file: + tree = ast.parse(file.read(), filename=file_path) + + for node in ast.walk(tree): + for child in ast.iter_child_nodes(node): + setattr(child, 'parent', node) + + function_cloud_map = {} + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'): + class_name = None + if hasattr(node, 'parent') and isinstance(node.parent, + ast.ClassDef): + class_name = node.parent.name + + clouds_to_include = [] + clouds_to_exclude = [] + for 
decorator in node.decorator_list: + if isinstance(decorator, ast.Call): + # We only need to consider the decorator with no arguments + # to extract clouds. + continue + full_path = _get_full_decorator_path(decorator) + if full_path.startswith('pytest.mark.'): + assert isinstance(decorator, ast.Attribute) + suffix = decorator.attr + if suffix.startswith('no_'): + clouds_to_exclude.append(suffix[3:]) + else: + clouds_to_include.append(suffix) + clouds_to_include = (clouds_to_include if clouds_to_include else + copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) + clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud not in clouds_to_exclude + ] + function_name = (f'{class_name}::{node.name}' + if class_name else node.name) + function_cloud_map[function_name] = (clouds_to_include) + return function_cloud_map + + +def _generate_pipeline(test_file: str) -> Dict[str, Any]: + """Generate a Buildkite pipeline from test files.""" + steps = [] + function_cloud_map = _extract_marked_tests(test_file) + for test_function, clouds in function_cloud_map.items(): + for cloud in clouds: + step = { + 'label': f'{test_function} on {cloud}', + 'command': f'pytest {test_file}::{test_function} --{cloud}', + 'env': { + 'LOG_TO_STDOUT': '1' + } + } + steps.append(step) + # we only run one cloud per test function for now + break + return {'steps': steps} + + +def main(): + # List of test files to include in the pipeline + test_files = os.listdir('tests/smoke_tests') + + for test_file in test_files: + if not test_file.startswith('test_'): + continue + test_file_path = os.path.join('tests/smoke_tests', test_file) + pipeline = _generate_pipeline(test_file_path) + yaml_file_path = '.buildkite/pipeline_smoke_' + \ + f'{test_file.split(".")[0]}.yaml' + with open(yaml_file_path, 'w', encoding='utf-8') as file: + file.write('# This is an auto-generated Buildkite pipeline by ' + '.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n') + yaml.dump(pipeline, file, 
default_flow_style=False) + print(f'Convert {test_file_path} to {yaml_file_path}') + + +if __name__ == '__main__': + main() diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml deleted file mode 100644 index 4d8ed6ff8f0..00000000000 --- a/.buildkite/pipeline.yaml +++ /dev/null @@ -1,10 +0,0 @@ -steps: - - label: "test_minimal" - command: "pytest tests/smoke_tests/test_basic.py::test_minimal" - env: - LOG_TO_STDOUT: "1" - - - label: "test_launch_fast" - command: "pytest tests/smoke_tests/test_basic.py::test_launch_fast" - env: - LOG_TO_STDOUT: "1" diff --git a/.buildkite/pipeline_smoke_test_basic.yaml b/.buildkite/pipeline_smoke_test_basic.yaml new file mode 100644 index 00000000000..9c775c1f5fb --- /dev/null +++ b/.buildkite/pipeline_smoke_test_basic.yaml @@ -0,0 +1,91 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + env: + LOG_TO_STDOUT: '1' + label: test_example_app on aws +- command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + env: + LOG_TO_STDOUT: '1' + label: test_minimal on aws +- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + env: + LOG_TO_STDOUT: '1' + label: test_launch_fast on aws +- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + env: + LOG_TO_STDOUT: '1' + label: test_launch_fast_with_autostop on aws +- command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + env: + LOG_TO_STDOUT: '1' + label: test_stale_job on aws +- command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_stale_job_manual_restart on aws +- command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_stale_job_manual_restart on gcp +- command: pytest 
tests/smoke_tests/test_basic.py::test_env_check --aws + env: + LOG_TO_STDOUT: '1' + label: test_env_check on aws +- command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + env: + LOG_TO_STDOUT: '1' + label: test_cli_logs on aws +- command: pytest tests/smoke_tests/test_basic.py::test_scp_logs --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_logs on scp +- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp + env: + LOG_TO_STDOUT: '1' + label: test_core_api_sky_launch_exec on gcp +- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + env: + LOG_TO_STDOUT: '1' + label: test_core_api_sky_launch_fast on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_ordered on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_ordered_with_default on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_unordered on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_accelerators_unordered_with_default on aws +- command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + env: + LOG_TO_STDOUT: '1' + label: test_multiple_resources on aws +- command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + env: + LOG_TO_STDOUT: '1' + label: test_sky_bench on aws +- command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_context_failover on kubernetes +- command: pytest 
tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + env: + LOG_TO_STDOUT: '1' + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws diff --git a/.buildkite/pipeline_smoke_test_cluster_job.yaml b/.buildkite/pipeline_smoke_test_cluster_job.yaml new file mode 100644 index 00000000000..3b81274a00a --- /dev/null +++ b/.buildkite/pipeline_smoke_test_cluster_job.yaml @@ -0,0 +1,265 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + env: + LOG_TO_STDOUT: '1' + label: test_job_queue on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws + env: + LOG_TO_STDOUT: '1' + label: test_job_queue_with_docker on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --lambda_cloud + env: + LOG_TO_STDOUT: '1' + label: test_lambda_job_queue on lambda_cloud +- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_job_queue on ibm +- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_job_queue --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_job_queue on scp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + env: + LOG_TO_STDOUT: '1' + label: test_job_queue_multinode on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + env: + LOG_TO_STDOUT: '1' + label: test_large_job_queue on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws + env: + LOG_TO_STDOUT: '1' + label: test_fast_large_job_queue on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue_multinode + --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_job_queue_multinode on ibm +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + --aws + env: + LOG_TO_STDOUT: '1' + label: test_docker_preinstalled_package on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + env: + LOG_TO_STDOUT: '1' + label: test_multi_echo on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + env: + LOG_TO_STDOUT: '1' + label: test_huggingface on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --lambda_cloud + env: + LOG_TO_STDOUT: '1' + label: test_lambda_huggingface on lambda_cloud +- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_huggingface --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_huggingface on scp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + env: + LOG_TO_STDOUT: '1' + label: test_inferentia on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + env: + LOG_TO_STDOUT: '1' + label: test_tpu on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + env: + LOG_TO_STDOUT: '1' + label: test_tpu_vm on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + env: + LOG_TO_STDOUT: '1' + label: test_tpu_vm_pod on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_tpu_pod_slice_gke on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + env: + LOG_TO_STDOUT: '1' + label: test_multi_hostname on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + env: + LOG_TO_STDOUT: '1' + label: test_multi_node_failure on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_http_server_with_custom_ports on gcp +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_http_server_with_custom_ports on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_http_server_with_custom_ports on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_http_server_with_custom_ports on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_paperspace_http_server_with_custom_ports + --paperspace + env: + LOG_TO_STDOUT: '1' + label: test_paperspace_http_server_with_custom_ports on paperspace +- command: pytest tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports + --runpod + env: + LOG_TO_STDOUT: '1' + label: test_runpod_http_server_with_custom_ports on runpod +- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + env: + LOG_TO_STDOUT: '1' + label: test_task_labels_aws on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + env: + LOG_TO_STDOUT: '1' + label: test_task_labels_gcp on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_task_labels_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_container_logs_multinode_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_container_logs_two_jobs_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + env: + LOG_TO_STDOUT: '1' + label: test_distributed_tf on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_start_stop on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_start_stop on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + env: + LOG_TO_STDOUT: '1' + label: test_autostop on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + env: + LOG_TO_STDOUT: '1' + label: test_autodown on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_autodown --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_autodown on scp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + env: + LOG_TO_STDOUT: '1' + label: test_cancel_aws on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + env: + LOG_TO_STDOUT: '1' + label: test_cancel_gcp on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + env: + LOG_TO_STDOUT: '1' + label: test_cancel_azure on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch 
--aws + env: + LOG_TO_STDOUT: '1' + label: test_cancel_pytorch on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_ibm --ibm + env: + LOG_TO_STDOUT: '1' + label: test_cancel_ibm on ibm +- command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + env: + LOG_TO_STDOUT: '1' + label: test_use_spot on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + env: + LOG_TO_STDOUT: '1' + label: test_stop_gcp_spot on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + env: + LOG_TO_STDOUT: '1' + label: test_inline_env on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + env: + LOG_TO_STDOUT: '1' + label: test_inline_env_file on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_custom_image on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_custom_image on kubernetes +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_start_stop_two_nodes on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_disk_tier on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_disk_tier on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_disk_tier on azure +- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_best_tier_failover on azure +- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_zero_quota_failover on aws +- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_zero_quota_failover on gcp +- command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + env: + LOG_TO_STDOUT: '1' + label: test_long_setup_run_script on aws diff --git a/.buildkite/pipeline_smoke_test_images.yaml b/.buildkite/pipeline_smoke_test_images.yaml new file mode 100644 index 00000000000..4991fccbbc7 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_images.yaml @@ -0,0 +1,66 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_images on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_images on gcp +- command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_images on azure +- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_image_id_dict on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_image_id_dict on gcp +- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_image_id_dict_region on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_image_id_dict_region on gcp +- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + env: + LOG_TO_STDOUT: '1' + label: 
test_aws_image_id_dict_zone on aws +- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_image_id_dict_zone on gcp +- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + env: + LOG_TO_STDOUT: '1' + label: test_clone_disk_aws on aws +- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + env: + LOG_TO_STDOUT: '1' + label: test_clone_disk_gcp on gcp +- command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_mig on gcp +- command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_force_enable_external_ips on gcp +- command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + env: + LOG_TO_STDOUT: '1' + label: test_image_no_conda on aws +- command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + env: + LOG_TO_STDOUT: '1' + label: test_custom_default_conda_env on aws diff --git a/.buildkite/pipeline_smoke_test_managed_job.yaml b/.buildkite/pipeline_smoke_test_managed_job.yaml new file mode 100644 index 00000000000..cda2b87a53c --- /dev/null +++ b/.buildkite/pipeline_smoke_test_managed_job.yaml @@ -0,0 +1,79 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_job_pipeline on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_failed_setup on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_pipeline_failed_setup on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_pipeline_recovery_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_pipeline_recovery_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_default_resources on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_recovery_multi_node_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + env: + LOG_TO_STDOUT: 
'1' + label: test_managed_jobs_recovery_multi_node_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_cancellation_aws on aws +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_cancellation_gcp on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_storage on managed_jobs +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_tpu on gcp +- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --managed_jobs + env: + LOG_TO_STDOUT: '1' + label: test_managed_jobs_inline_env on managed_jobs diff --git a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml new file mode 100644 index 00000000000..6f1d11e7804 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml @@ -0,0 +1,139 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + env: + LOG_TO_STDOUT: '1' + label: test_file_mounts on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_scp_file_mounts + --scp + env: + LOG_TO_STDOUT: '1' + label: test_scp_file_mounts on scp +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + env: + LOG_TO_STDOUT: '1' + label: test_using_file_mounts_with_env_vars on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_storage_mounts_with_stop on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_storage_mounts_with_stop on gcp +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_storage_mounts_with_stop on azure +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_storage_mounts on kubernetes +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_kubernetes_context_switch on kubernetes +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + env: + LOG_TO_STDOUT: '1' + label: test_docker_storage_mounts on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_cloudflare_storage_mounts + --cloudflare + env: + LOG_TO_STDOUT: '1' + label: test_cloudflare_storage_mounts on cloudflare +- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_ibm_storage_mounts + --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_storage_mounts on ibm +- command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_bucket_external_deletion on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_public_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_nonexistent_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_private_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + env: + LOG_TO_STDOUT: '1' + label: 
TestStorageWithCredentials::test_upload_to_existing_bucket on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_list_source on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_invalid_names on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_aws_regions on aws +- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + env: + LOG_TO_STDOUT: '1' + label: TestStorageWithCredentials::test_gcs_regions on aws diff --git a/.buildkite/pipeline_smoke_test_region_and_zone.yaml b/.buildkite/pipeline_smoke_test_region_and_zone.yaml new file mode 100644 index 00000000000..ae38eb4b594 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_region_and_zone.yaml @@ -0,0 +1,36 @@ +# This is an auto-generated Buildkite pipeline by 
.buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_region on aws +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_with_ssh_proxy_command on aws +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_region_and_service_account on gcp +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_region --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_region on ibm +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + env: + LOG_TO_STDOUT: '1' + label: test_azure_region on azure +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + env: + LOG_TO_STDOUT: '1' + label: test_aws_zone on aws +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_zone --ibm + env: + LOG_TO_STDOUT: '1' + label: test_ibm_zone on ibm +- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + env: + LOG_TO_STDOUT: '1' + label: test_gcp_zone on gcp diff --git a/.buildkite/pipeline_smoke_test_required_before_merge.yaml b/.buildkite/pipeline_smoke_test_required_before_merge.yaml new file mode 100644 index 00000000000..8a29f838e4e --- /dev/null +++ b/.buildkite/pipeline_smoke_test_required_before_merge.yaml @@ -0,0 +1,7 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + env: + LOG_TO_STDOUT: '1' + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_test_sky_serve.yaml b/.buildkite/pipeline_smoke_test_sky_serve.yaml new file mode 100644 index 00000000000..0fd84641780 --- /dev/null +++ b/.buildkite/pipeline_smoke_test_sky_serve.yaml @@ -0,0 +1,106 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_gcp_http on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_aws_http on aws +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_azure_http on azure +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_kubernetes_http on kubernetes +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_oci_http --oci + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_oci_http on oci +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_llm on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_spot_recovery on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_base_ondemand_fallback on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + env: + LOG_TO_STDOUT: '1' + label: 
test_skyserve_dynamic_ondemand_fallback on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_user_bug_restart on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_load_balancer on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_auto_restart on gcp +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_cancel on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_streaming on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_readiness_timeout_fail on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_large_readiness_timeout on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_rolling_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_fast_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_update_autoscale on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --serve + 
env: + LOG_TO_STDOUT: '1' + label: test_skyserve_new_autoscaler_update on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --serve + env: + LOG_TO_STDOUT: '1' + label: test_skyserve_failures on serve +- command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + env: + LOG_TO_STDOUT: '1' + label: test_user_dependencies on aws diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index c0996e135d0..0090ae957b8 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -1,39 +1,45 @@ -# Smoke tests for SkyPilot +# Smoke tests for SkyPilot for basic functionality # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py +# > pytest tests/smoke_tests/test_basic.py # # Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure +# > pytest tests/smoke_tests/test_basic.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve +# > pytest tests/smoke_tests/test_basic.py::test_minimal # # Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws +# > pytest tests/smoke_tests/test_basic.py --aws # # Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws +# > pytest tests/smoke_tests/test_basic.py --generic-cloud aws + +import pathlib +import subprocess +import tempfile +import textwrap +import time import pytest -from smoke_tests.util import _get_cluster_name -from smoke_tests.util import _get_timeout -from smoke_tests.util import _VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import 
get_cluster_name +from smoke_tests.util import get_timeout from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_TYPE from smoke_tests.util import Test +from smoke_tests.util import VALIDATE_LAUNCH_OUTPUT +from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB +import sky +from sky.skylet import events +from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus +from sky.utils import common_utils # ---------- Dry run: 2 Tasks in a chain. ---------- @@ -48,16 +54,16 @@ def test_example_app(): # ---------- A minimal task ---------- def test_minimal(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'minimal', [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', # Output validation done. f'sky logs {name} 1 --status', f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. 
# Check the logs downloading @@ -83,20 +89,20 @@ def test_minimal(generic_cloud: str): f'sky exec -c {name} && exit 1 || true', ], f'sky down -y {name}', - _get_timeout(generic_cloud), + get_timeout(generic_cloud), ) run_one_test(test) # ---------- Test fast launch ---------- def test_launch_fast(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'test_launch_fast', [ # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', # Second launch to test fast launch - should not reprovision @@ -112,7 +118,7 @@ def test_launch_fast(generic_cloud: str): f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=_get_timeout(generic_cloud), + timeout=get_timeout(generic_cloud), ) run_one_test(test) @@ -123,7 +129,7 @@ def test_launch_fast(generic_cloud: str): @pytest.mark.no_ibm @pytest.mark.no_kubernetes def test_launch_fast_with_autostop(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. 
autostop_timeout = 600 if generic_cloud == 'azure' else 250 @@ -131,22 +137,471 @@ def test_launch_fast_with_autostop(generic_cloud: str): 'test_launch_fast_with_autostop', [ # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', # Ensure cluster is stopped - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( cluster_name=name, cluster_status=ClusterStatus.STOPPED.value, timeout=autostop_timeout), # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=_get_timeout(generic_cloud) + autostop_timeout, + timeout=get_timeout(generic_cloud) + autostop_timeout, + ) + run_one_test(test) + + +# ------------ Test stale job ------------ +@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances +@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances +def test_stale_job(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'stale_job', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', + f'sky exec {name} -d "echo start; sleep 10000"', + f'sky stop {name} -y', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + 
cluster_status=ClusterStatus.STOPPED.value, + timeout=100), + f'sky start {name} -y', + f'sky logs {name} 1 --status', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED_DRIVER', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_stale_job_manual_restart(): + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.AWS.max_cluster_name_length()) + region = 'us-east-2' + test = Test( + 'aws_stale_job_manual_restart', + [ + f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', + f'sky exec {name} -d "echo start; sleep 10000"', + # Stop the cluster manually. + f'id=`aws ec2 describe-instances --region {region} --filters ' + f'Name=tag:ray-cluster-name,Values={name_on_cloud} ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text`; ' + f'aws ec2 stop-instances --region {region} ' + '--instance-ids $id', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), + f'sky launch -c {name} -y "echo hi"', + f'sky logs {name} 1 --status', + f'sky logs {name} 3 --status', + # Ensure the skylet updated the stale job status. 
+ WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=name, + job_status=JobStatus.FAILED_DRIVER.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_stale_job_manual_restart(): + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.GCP.max_cluster_name_length()) + zone = 'us-west2-a' + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name={name_on_cloud})" ' + f'--zones={zone} --format="value(name)"') + stop_cmd = (f'gcloud compute instances stop --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'gcp_stale_job_manual_restart', + [ + f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', + f'sky exec {name} -d "echo start; sleep 10000"', + # Stop the cluster manually. + stop_cmd, + 'sleep 40', + f'sky launch -c {name} -y "echo hi"', + f'sky logs {name} 1 --status', + f'sky logs {name} 3 --status', + # Ensure the skylet updated the stale job status. + WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=name, + job_status=JobStatus.FAILED_DRIVER.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Check Sky's environment variables; workdir. ---------- +@pytest.mark.no_fluidstack # Requires amazon S3 +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +def test_env_check(generic_cloud: str): + name = get_cluster_name() + total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 + test = Test( + 'env_check', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + timeout=total_timeout_minutes * 60, + ) + run_one_test(test) + + +# ---------- CLI logs ---------- +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. +def test_cli_logs(generic_cloud: str): + name = get_cluster_name() + num_nodes = 2 + if generic_cloud == 'kubernetes': + # Kubernetes does not support multi-node + num_nodes = 1 + timestamp = time.time() + test = Test('cli_logs', [ + f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', + f'sky exec {name} "echo {timestamp} 2"', + f'sky exec {name} "echo {timestamp} 3"', + f'sky exec {name} "echo {timestamp} 4"', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 4 --sync-down', + f'sky logs {name} * --sync-down', + f'sky logs {name} 1 | grep "{timestamp} 1"', + f'sky logs {name} | grep "{timestamp} 4"', + ], f'sky down -y {name}') + run_one_test(test) + + +@pytest.mark.scp +def test_scp_logs(): + name = get_cluster_name() + timestamp = time.time() + test = Test( + 'SCP_cli_logs', + [ + f'sky launch -y -c {name} {SCP_TYPE} "echo {timestamp} 1"', + f'sky exec {name} "echo {timestamp} 2"', + f'sky exec {name} "echo {timestamp} 3"', + f'sky exec {name} "echo {timestamp} 4"', + f'sky logs {name} 2 --status', + f'sky logs {name} 3 4 --sync-down', + f'sky logs {name} * --sync-down', + f'sky logs {name} 1 | grep "{timestamp} 1"', + f'sky logs {name} | grep "{timestamp} 4"', + ], + f'sky down -y {name}', ) run_one_test(test) + + +# ------- Testing the core API -------- +# Most of the core APIs have been tested in the CLI tests. +# These tests are for testing the return value of the APIs not fully used in CLI. 
+ + +@pytest.mark.gcp +def test_core_api_sky_launch_exec(): + name = get_cluster_name() + task = sky.Task(run="whoami") + task.set_resources(sky.Resources(cloud=sky.GCP())) + job_id, handle = sky.launch(task, cluster_name=name) + assert job_id == 1 + assert handle is not None + assert handle.cluster_name == name + assert handle.launched_resources.cloud.is_same_cloud(sky.GCP()) + job_id_exec, handle_exec = sky.exec(task, cluster_name=name) + assert job_id_exec == 2 + assert handle_exec is not None + assert handle_exec.cluster_name == name + assert handle_exec.launched_resources.cloud.is_same_cloud(sky.GCP()) + # For dummy task (i.e. task.run is None), the job won't be submitted. + dummy_task = sky.Task() + job_id_dummy, _ = sky.exec(dummy_task, cluster_name=name) + assert job_id_dummy is None + sky.down(name) + + +# The sky launch CLI has some additional checks to make sure the cluster is up/ +# restarted. However, the core API doesn't have these; make sure it still works +def test_core_api_sky_launch_fast(generic_cloud: str): + name = get_cluster_name() + cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) + try: + task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) + sky.launch(task, + cluster_name=name, + idle_minutes_to_autostop=1, + fast=True) + # Sleep to let the cluster autostop + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED, + timeout=120) + # Run it again - should work with fast=True + sky.launch(task, + cluster_name=name, + idle_minutes_to_autostop=1, + fast=True) + finally: + sky.down(name) + + +# ---------- Testing YAML Specs ---------- +# Our sky storage requires credentials to check the bucket existance when +# loading a task from the yaml file, so we cannot make it a unit test. +class TestYamlSpecs: + # TODO(zhwu): Add test for `to_yaml_config` for the Storage object. 
+ # We should not use `examples/storage_demo.yaml` here, since it requires + # users to ensure bucket names to not exist and/or be unique. + _TEST_YAML_PATHS = [ + 'examples/minimal.yaml', 'examples/managed_job.yaml', + 'examples/using_file_mounts.yaml', 'examples/resnet_app.yaml', + 'examples/multi_hostname.yaml' + ] + + def _is_dict_subset(self, d1, d2): + """Check if d1 is the subset of d2.""" + for k, v in d1.items(): + if k not in d2: + if isinstance(v, list) or isinstance(v, dict): + assert len(v) == 0, (k, v) + else: + assert False, (k, v) + elif isinstance(v, dict): + assert isinstance(d2[k], dict), (k, v, d2) + self._is_dict_subset(v, d2[k]) + elif isinstance(v, str): + if k == 'accelerators': + resources = sky.Resources() + resources._set_accelerators(v, None) + assert resources.accelerators == d2[k], (k, v, d2) + else: + assert v.lower() == d2[k].lower(), (k, v, d2[k]) + else: + assert v == d2[k], (k, v, d2[k]) + + def _check_equivalent(self, yaml_path): + """Check if the yaml is equivalent after load and dump again.""" + origin_task_config = common_utils.read_yaml(yaml_path) + + task = sky.Task.from_yaml(yaml_path) + new_task_config = task.to_yaml_config() + # d1 <= d2 + print(origin_task_config, new_task_config) + self._is_dict_subset(origin_task_config, new_task_config) + + def test_load_dump_yaml_config_equivalent(self): + """Test if the yaml config is equivalent after load and dump again.""" + pathlib.Path('~/datasets').expanduser().mkdir(exist_ok=True) + pathlib.Path('~/tmpfile').expanduser().touch() + pathlib.Path('~/.ssh').expanduser().mkdir(exist_ok=True) + pathlib.Path('~/.ssh/id_rsa.pub').expanduser().touch() + pathlib.Path('~/tmp-workdir').expanduser().mkdir(exist_ok=True) + pathlib.Path('~/Downloads/tpu').expanduser().mkdir(parents=True, + exist_ok=True) + for yaml_path in self._TEST_YAML_PATHS: + self._check_equivalent(yaml_path) + + +# ---------- Testing Multiple Accelerators ---------- +@pytest.mark.no_fluidstack # Fluidstack does not 
support K80 gpus for now +@pytest.mark.no_paperspace # Paperspace does not support K80 gpus +def test_multiple_accelerators_ordered(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-ordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs +@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs +def test_multiple_accelerators_ordered_with_default(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-ordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status {name} | grep Spot', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs +@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs +def test_multiple_accelerators_unordered(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-unordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs +@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs +def test_multiple_accelerators_unordered_with_default(): + name = get_cluster_name() + test = Test( + 'multiple-accelerators-unordered-with-default', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky status {name} | grep Spot', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Requires other clouds to be enabled +def test_multiple_resources(): + name = get_cluster_name() + test = Test( + 'multiple-resources', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Sky Benchmark ---------- +@pytest.mark.no_fluidstack # Requires other clouds to be enabled +@pytest.mark.no_paperspace # Requires other clouds to be enabled +@pytest.mark.no_kubernetes +@pytest.mark.aws # SkyBenchmark requires S3 access +def test_sky_bench(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'sky-bench', + [ + f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', + 'sleep 120', + f'sky bench show {name} | grep sky-bench-{name} | grep FINISHED', + ], + f'sky bench down {name} -y; sky bench delete {name} -y', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def test_kubernetes_context_failover(): + """Test if the kubernetes context failover works. + + This test requires two kubernetes clusters: + - kind-skypilot: the local cluster with mock labels for 8 H100 GPUs. 
+ - another accessible cluster: with enough CPUs + To start the first cluster, run: + sky local up + # Add mock label for accelerator + kubectl label node --overwrite skypilot-control-plane skypilot.co/accelerator=h100 --context kind-skypilot + # Get the token for the cluster in context kind-skypilot + TOKEN=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.users[0].user.token}\') + # Get the API URL for the cluster in context kind-skypilot + API_URL=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.clusters[0].cluster.server}\') + # Add mock capacity for GPU + curl --header "Content-Type: application/json-patch+json" --header "Authorization: Bearer $TOKEN" --request PATCH --data \'[{"op": "add", "path": "/status/capacity/nvidia.com~1gpu", "value": "8"}]\' "$API_URL/api/v1/nodes/skypilot-control-plane/status" + # Add a new namespace to test the handling of namespaces + kubectl create namespace test-namespace --context kind-skypilot + # Set the namespace to test-namespace + kubectl config set-context kind-skypilot --namespace=test-namespace --context kind-skypilot + """ + # Get context that is not kind-skypilot + contexts = subprocess.check_output('kubectl config get-contexts -o name', + shell=True).decode('utf-8').split('\n') + context = [context for context in contexts if context != 'kind-skypilot'][0] + config = textwrap.dedent(f"""\ + kubernetes: + allowed_contexts: + - kind-skypilot + - {context} + """) + with tempfile.NamedTemporaryFile(delete=True) as f: + f.write(config.encode('utf-8')) + f.flush() + name = get_cluster_name() + test = Test( + 'kubernetes-context-failover', + [ + # Check if kind-skypilot is provisioned with H100 annotations already + 'NODE_INFO=$(kubectl get nodes -o yaml --context kind-skypilot) && ' + 'echo "$NODE_INFO" | grep nvidia.com/gpu | grep 8 && ' + 'echo "$NODE_INFO" | grep skypilot.co/accelerator | grep h100 || ' + '{ echo "kind-skypilot does not exist ' + 'or does not have mock labels for 
GPUs. Check the instructions in ' + 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', + # Check namespace for kind-skypilot is test-namespace + 'kubectl get namespaces --context kind-skypilot | grep test-namespace || ' + '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in ' + 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', + 'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"', + # Get contexts and set current context to the other cluster that is not kind-skypilot + f'kubectl config use-context {context}', + # H100 should not in the current context + '! sky show-gpus --cloud kubernetes | grep H100', + f'sky launch -y -c {name}-1 --cpus 1 echo hi', + f'sky logs {name}-1 --status', + # It should be launched not on kind-skypilot + f'sky status -a {name}-1 | grep "{context}"', + # Test failure for launching H100 on other cluster + f'sky launch -y -c {name}-2 --gpus H100 --cpus 1 --cloud kubernetes --region {context} echo hi && exit 1 || true', + # Test failover + f'sky launch -y -c {name}-3 --gpus H100 --cpus 1 --cloud kubernetes echo hi', + f'sky logs {name}-3 --status', + # Test pods + f'kubectl get pods --context kind-skypilot | grep "{name}-3"', + # It should be launched on kind-skypilot + f'sky status -a {name}-3 | grep "kind-skypilot"', + # Should be 7 free GPUs + f'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep " 7"', + # Remove the line with "kind-skypilot" + f'sed -i "/kind-skypilot/d" {f.name}', + # Should still be able to exec and launch on existing cluster + f'sky exec {name}-3 "echo hi"', + f'sky logs {name}-3 --status', + f'sky status -r {name}-3 | grep UP', + f'sky launch -c {name}-3 --gpus h100 echo hi', + f'sky logs {name}-3 --status', + f'sky status -r {name}-3 | grep UP', + ], + f'sky down -y {name}-1 {name}-3', + env={'SKYPILOT_CONFIG': f.name}, + ) + run_one_test(test) diff --git 
a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py new file mode 100644 index 00000000000..22b6d9dc8f0 --- /dev/null +++ b/tests/smoke_tests/test_cluster_job.py @@ -0,0 +1,1657 @@ +# Smoke tests for SkyPilot for sky launched cluster and cluster job +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_cluster_job.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_cluster_job.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_cluster_job.py::test_job_queue +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_cluster_job.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_cluster_job.py --generic-cloud aws + +import pathlib +import tempfile +import textwrap + +import jinja2 +import pytest +from smoke_tests.util import BUMP_UP_SECONDS +from smoke_tests.util import get_aws_region_for_quota_failover +from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_gcp_region_for_quota_failover +from smoke_tests.util import get_timeout +from smoke_tests.util import LAMBDA_TYPE +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_GPU_V100 +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import Test +from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID + +import sky +from sky import AWS +from sky import Azure +from sky import GCP +from sky.skylet import constants +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus +from sky.utils import common_utils +from sky.utils import resources_utils + + +# ---------- Job Queue. 
# ---------- Job Queue. ----------
@pytest.mark.no_fluidstack  # FluidStack DC has low availability of T4 GPUs
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have T4 gpus
@pytest.mark.no_ibm  # IBM Cloud does not have T4 gpus. run test_ibm_job_queue instead
@pytest.mark.no_scp  # SCP does not have T4 gpus. Run test_scp_job_queue instead
@pytest.mark.no_paperspace  # Paperspace does not have T4 gpus.
@pytest.mark.no_oci  # OCI does not have T4 gpus
def test_job_queue(generic_cloud: str):
    """Queue three jobs on one cluster; check RUNNING/PENDING and cancel flow."""
    name = get_cluster_name()
    # Submit three detached jobs; the cluster has capacity for two, so the
    # third must sit in PENDING until a slot frees up.
    submit_and_check = [
        f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml',
        f'sky exec {name} -n {name}-1 -d examples/job_queue/job.yaml',
        f'sky exec {name} -n {name}-2 -d examples/job_queue/job.yaml',
        f'sky exec {name} -n {name}-3 -d examples/job_queue/job.yaml',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
    ]
    # Cancelling a RUNNING job should promote the PENDING one.
    cancel_and_check = [
        f'sky cancel -y {name} 2',
        'sleep 5',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
        f'sky cancel -y {name} 3',
        # Fractional and full GPU requests both expose exactly 1 GPU per node.
        f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
        f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
        f'sky logs {name} 4 --status',
        f'sky logs {name} 5 --status',
    ]
    test = Test(
        'job_queue',
        submit_and_check + cancel_and_check,
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Job Queue with Docker.
# ----------
@pytest.mark.no_fluidstack  # FluidStack does not support docker for now
@pytest.mark.no_lambda_cloud  # Doesn't support Lambda Cloud for now
@pytest.mark.no_ibm  # Doesn't support IBM Cloud for now
@pytest.mark.no_paperspace  # Paperspace doesn't have T4 GPUs
@pytest.mark.no_scp  # Doesn't support SCP for now
@pytest.mark.no_oci  # Doesn't support OCI for now
@pytest.mark.no_kubernetes  # Doesn't support Kubernetes for now
@pytest.mark.parametrize(
    'image_id',
    [
        'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04',
        'docker:ubuntu:18.04',
        # Test latest image with python 3.11 installed by default.
        'docker:continuumio/miniconda3:24.1.2-0',
        # Test python>=3.12 where SkyPilot should automatically create a separate
        # conda env for runtime with python 3.10.
        'docker:continuumio/miniconda3:latest',
        # Axolotl image is a good example custom image that has its conda path
        # set in PATH with dockerfile and uses python>=3.12. It could test:
        # 1. we handle the env var set in dockerfile correctly
        # 2. python>=3.12 works with SkyPilot runtime.
        'docker:winglian/axolotl:main-latest'
    ])
def test_job_queue_with_docker(generic_cloud: str, image_id: str):
    """Job-queue semantics inside docker images, including stop/start survival."""
    # Suffix the cluster name with the image tag so parametrized runs don't clash.
    name = get_cluster_name() + image_id[len('docker:'):][:4]
    total_timeout_minutes = 40 if generic_cloud == 'azure' else 15
    time_to_sleep = 300 if generic_cloud == 'azure' else 180
    test = Test(
        'job_queue_with_docker',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml',
            f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
            f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
            f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
            # Make sure the GPU is still visible to the container.
            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
            f'sky logs {name} 4 --status',
            f'sky stop -y {name}',
            # Make sure the job status preserve after stop and start the
            # cluster. This is also a test for the docker container to be
            # preserved after stop and start.
            f'sky start -y {name}',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED',
            f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky logs {name} 5 --status',
            f'sky logs {name} 6 --status',
            # Make sure it is still visible after an stop & start cycle.
            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
            f'sky logs {name} 7 --status'
        ],
        f'sky down -y {name}',
        timeout=total_timeout_minutes * 60,
    )
    run_one_test(test)


@pytest.mark.lambda_cloud
def test_lambda_job_queue():
    """Job queue on Lambda Cloud with fractional A10 GPUs."""
    name = get_cluster_name()
    test = Test(
        'lambda_job_queue',
        [
            f'sky launch -y -c {name} {LAMBDA_TYPE} examples/job_queue/cluster.yaml',
            f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml',
            f'sky queue {name} | grep {name}-1 | grep RUNNING',
            f'sky queue {name} | grep {name}-2 | grep RUNNING',
            f'sky queue {name} | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.ibm
def test_ibm_job_queue():
    """Job queue on IBM Cloud with V100 GPUs."""
    name = get_cluster_name()
    test = Test(
        'ibm_job_queue',
        [
            f'sky launch -y -c {name} --cloud ibm --gpus v100',
            f'sky exec {name} -n {name}-1 --cloud ibm -d examples/job_queue/job_ibm.yaml',
            f'sky exec {name} -n {name}-2 --cloud ibm -d examples/job_queue/job_ibm.yaml',
            f'sky exec {name} -n {name}-3 --cloud ibm -d examples/job_queue/job_ibm.yaml',
            f'sky queue {name} | grep {name}-1 | grep RUNNING',
            f'sky queue {name} | grep {name}-2 | grep RUNNING',
            f'sky queue {name} | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.scp
def test_scp_job_queue():
    """Job queue on SCP with fractional V100 GPUs."""
    name = get_cluster_name()
    num_of_gpu_launch = 1
    num_of_gpu_exec = 0.5
    test = Test(
        'SCP_job_queue',
        [
            f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml',
            f'sky exec {name} -n {name}-1 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-2 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml',
            f'sky exec {name} -n {name}-3 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml',
            f'sky queue {name} | grep {name}-1 | grep RUNNING',
            f'sky queue {name} | grep {name}-2 | grep RUNNING',
            f'sky queue {name} | grep {name}-3 | grep PENDING',
            f'sky cancel -y {name} 2',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 3',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # FluidStack DC has low availability of T4 GPUs
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have T4 gpus
@pytest.mark.no_ibm  # IBM Cloud does not have T4 gpus. run test_ibm_job_queue_multinode instead
@pytest.mark.no_paperspace  # Paperspace does not have T4 gpus.
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
@pytest.mark.no_oci  # OCI Cloud does not have T4 gpus.
@pytest.mark.no_kubernetes  # Kubernetes not support num_nodes > 1 yet
def test_job_queue_multinode(generic_cloud: str):
    """Job queue and SETTING_UP/cancel semantics on a multi-node cluster."""
    name = get_cluster_name()
    total_timeout_minutes = 30 if generic_cloud == 'azure' else 15
    test = Test(
        'job_queue_multinode',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml',
            f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml',
            f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml',
            f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)',
            'sleep 90',
            f'sky cancel -y {name} 1',
            'sleep 5',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP',
            f'sky cancel -y {name} 1 2 3',
            f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml',
            # Test the job status is correctly set to SETTING_UP, during the setup is running,
            # and the job can be cancelled during the setup.
            'sleep 5',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)',
            f'sky cancel -y {name} 4',
            f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)',
            f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky logs {name} 5 --status',
            f'sky logs {name} 6 --status',
            f'sky logs {name} 7 --status',
        ],
        f'sky down -y {name}',
        timeout=total_timeout_minutes * 60,
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # No FluidStack VM has 8 CPUs
@pytest.mark.no_lambda_cloud  # No Lambda Cloud VM has 8 CPUs
def test_large_job_queue(generic_cloud: str):
    """FIFO scheduling with 75 queued jobs on an 8-CPU cluster."""
    name = get_cluster_name()
    test = Test(
        'large_job_queue',
        [
            f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}',
            f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done',
            f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16',
            'sleep 90',
            # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there
            # should be 8 / 0.5 = 16 jobs running.
            # The first 16 jobs are canceled, so there should be 75 - 32 = 43
            # jobs PENDING.
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43',
            # Make sure the jobs are scheduled in FIFO order
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED'
                for i in range(1, 17)
            ],
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING'
                for i in range(17, 33)
            ],
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep PENDING'
                for i in range(33, 75)
            ],
            f'sky cancel -y {name} 33 35 37 39 17 18 19',
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED'
                for i in range(33, 40, 2)
            ],
            'sleep 10',
            *[
                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING'
                for i in [34, 36, 38]
            ],
        ],
        f'sky down -y {name}',
        timeout=25 * 60,
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # No FluidStack VM has 8 CPUs
@pytest.mark.no_lambda_cloud  # No Lambda Cloud VM has 8 CPUs
def test_fast_large_job_queue(generic_cloud: str):
    # This is to test the jobs can be scheduled quickly when there are many
    # jobs in the queue.
    name = get_cluster_name()
    test = Test(
        'fast_large_job_queue',
        [
            f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}',
            f'for i in `seq 1 32`; do sky exec {name} -n {name}-$i -d "echo $i"; done',
            'sleep 60',
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep SUCCEEDED | wc -l | grep 32',
        ],
        f'sky down -y {name}',
        timeout=20 * 60,
    )
    run_one_test(test)


@pytest.mark.ibm
def test_ibm_job_queue_multinode():
    """Multi-node job queue on IBM Cloud, including SETTING_UP/cancel flow."""
    name = get_cluster_name()
    task_file = 'examples/job_queue/job_multinode_ibm.yaml'
    test = Test(
        'ibm_job_queue_multinode',
        [
            f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2',
            f'sky exec {name} -n {name}-1 -d {task_file}',
            f'sky exec {name} -n {name}-2 -d {task_file}',
            f'sky launch -y -c {name} -n {name}-3 --detach-setup -d {task_file}',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep SETTING_UP)',
            'sleep 90',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep PENDING)',
            f'sky cancel -y {name} 1',
            'sleep 5',
            f'sky queue {name} | grep {name}-3 | grep RUNNING',
            f'sky cancel -y {name} 1 2 3',
            f'sky launch -c {name} -n {name}-4 --detach-setup -d {task_file}',
            # Test the job status is correctly set to SETTING_UP, during the setup is running,
            # and the job can be cancelled during the setup.
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)',
            f'sky cancel -y {name} 4',
            f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)',
            f'sky exec {name} --gpus v100:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus v100:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky exec {name} --gpus v100:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
            f'sky logs {name} 5 --status',
            f'sky logs {name} 6 --status',
            f'sky logs {name} 7 --status',
        ],
        f'sky down -y {name}',
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


# ---------- Docker with preinstalled package. ----------
@pytest.mark.no_fluidstack  # Doesn't support Fluidstack for now
@pytest.mark.no_lambda_cloud  # Doesn't support Lambda Cloud for now
@pytest.mark.no_ibm  # Doesn't support IBM Cloud for now
@pytest.mark.no_scp  # Doesn't support SCP for now
@pytest.mark.no_oci  # Doesn't support OCI for now
@pytest.mark.no_kubernetes  # Doesn't support Kubernetes for now
# TODO(zhwu): we should fix this for kubernetes
def test_docker_preinstalled_package(generic_cloud: str):
    """Launch a docker:nginx cluster and verify the preinstalled binary works."""
    name = get_cluster_name()
    test = Test(
        'docker_with_preinstalled_package',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx',
            f'sky exec {name} "nginx -V"',
            f'sky logs {name} 1 --status',
            f'sky exec {name} whoami | grep root',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Submitting multiple tasks to the same cluster.
# ----------
@pytest.mark.no_fluidstack  # FluidStack DC has low availability of T4 GPUs
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have T4 gpus
@pytest.mark.no_paperspace  # Paperspace does not have T4 gpus
@pytest.mark.no_ibm  # IBM Cloud does not have T4 gpus
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
@pytest.mark.no_oci  # OCI Cloud does not have T4 gpus
def test_multi_echo(generic_cloud: str):
    """Submit 32 parallel echo jobs and check scheduler throughput/success."""
    name = get_cluster_name()
    # Poll the queue several times: no job may ever show FAILED while the
    # scheduler works through the batch.
    no_failure_checks = [
        f'python examples/multi_echo.py {name} {generic_cloud}',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        'sleep 10',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        'sleep 30',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        'sleep 30',
        # Make sure that our job scheduler is fast enough to have at least
        # 10 RUNNING jobs in parallel.
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "RUNNING" | wc -l | awk \'{{if ($1 < 10) exit 1}}\'',
        'sleep 30',
        f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true',
        f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done',
    ]
    # Ensure jobs succeeded.
    success_checks = [
        WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format(
            cluster_name=name,
            job_id=i + 1,
            job_status=JobStatus.SUCCEEDED.value,
            timeout=120) for i in range(32)
    ]
    # Ensure monitor/autoscaler didn't crash on the 'assert not
    # unfulfilled' error. If process not found, grep->ssh returns 1.
    monitor_check = [f'ssh {name} \'ps aux | grep "[/]"monitor.py\'']
    test = Test(
        'multi_echo',
        no_failure_checks + success_checks + monitor_check,
        f'sky down -y {name}',
        timeout=20 * 60,
    )
    run_one_test(test)


# ---------- Task: 1 node training.
# ----------
@pytest.mark.no_fluidstack  # Fluidstack does not have T4 gpus for now
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not have V100 gpus
@pytest.mark.no_ibm  # IBM cloud currently doesn't provide public image with CUDA
@pytest.mark.no_scp  # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead.
def test_huggingface(generic_cloud: str):
    """Run the HuggingFace GLUE/IMDB example via launch, then again via exec."""
    name = get_cluster_name()
    test = Test(
        'huggingface_glue_imdb_app',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky exec {name} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.lambda_cloud
def test_lambda_huggingface(generic_cloud: str):
    """Same HuggingFace example pinned to a Lambda Cloud instance type."""
    name = get_cluster_name()
    test = Test(
        'lambda_huggingface_glue_imdb_app',
        [
            f'sky launch -y -c {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky exec {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


@pytest.mark.scp
def test_scp_huggingface(generic_cloud: str):
    """Same HuggingFace example pinned to SCP with a V100 GPU."""
    name = get_cluster_name()
    num_of_gpu_launch = 1
    test = Test(
        'SCP_huggingface_glue_imdb_app',
        [
            f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky exec {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Inferentia.
# ----------
@pytest.mark.aws
def test_inferentia():
    """Launch an AWS Inferentia (inf2) instance and schedule on the accelerator."""
    name = get_cluster_name()
    test = Test(
        'test_inferentia',
        [
            f'sky launch -y -c {name} -t inf2.xlarge -- echo hi',
            f'sky exec {name} --gpus Inferentia:1 echo hi',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- TPU. ----------
@pytest.mark.gcp
@pytest.mark.tpu
def test_tpu():
    """Launch the classic TPU-node example and check idempotent re-launch."""
    name = get_cluster_name()
    test = Test(
        'tpu_app',
        [
            f'sky launch -y -c {name} examples/tpu/tpu_app.yaml',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            # Ensure sky launch won't create another TPU.
            f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"',
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take >20 mins
    )
    run_one_test(test)


# ---------- TPU VM. ----------
@pytest.mark.gcp
@pytest.mark.tpu
def test_tpu_vm():
    """TPU VM lifecycle: run, stop, restart with retry, run again."""
    name = get_cluster_name()
    test = Test(
        'tpu_vm_app',
        [
            f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky stop -y {name}',
            # Ensure the cluster is STOPPED.
            f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',
            # Use retry: guard against transient errors observed for
            # just-stopped TPU VMs (#962).
            f'sky start --retry-until-up -y {name}',
            f'sky exec {name} examples/tpu/tpuvm_mnist.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
            f'sky stop -y {name}',
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take 30 mins
    )
    run_one_test(test)


# ---------- TPU VM Pod.
# ----------
@pytest.mark.gcp
@pytest.mark.tpu
def test_tpu_vm_pod():
    """Launch a spot TPU v2-32 pod slice and run the MNIST example."""
    name = get_cluster_name()
    test = Test(
        'tpu_pod',
        [
            f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take 30 mins
    )
    run_one_test(test)


# ---------- TPU Pod Slice on GKE. ----------
@pytest.mark.kubernetes
def test_tpu_pod_slice_gke():
    """Run the MNIST example on a GKE TPU v5 lite pod slice."""
    name = get_cluster_name()
    test = Test(
        'tpu_pod_slice_gke',
        [
            f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice',
            f'sky logs {name} 1',  # Ensure the job finished.
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            # Ensure TPU is reachable.
            f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"',
            f'sky logs {name} 2 --status'
        ],
        f'sky down -y {name}',
        timeout=30 * 60,  # can take 30 mins
    )
    run_one_test(test)


# ---------- Simple apps. ----------
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
def test_multi_hostname(generic_cloud: str):
    """Two-node task: each node prints its hostname; re-exec also succeeds."""
    name = get_cluster_name()
    total_timeout_minutes = 25 if generic_cloud == 'azure' else 15
    test = Test(
        'multi_hostname',
        [
            f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            f'sky logs {name} 1 | grep "My hostname:" | wc -l | grep 2',  # Ensure there are 2 hosts.
            f'sky exec {name} examples/multi_hostname.yaml',
            f'sky logs {name} 2 --status',  # Ensure the job succeeded.
        ],
        f'sky down -y {name}',
        timeout=get_timeout(generic_cloud, total_timeout_minutes * 60),
    )
    run_one_test(test)


@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
def test_multi_node_failure(generic_cloud: str):
    """A failing worker must mark setup/run FAILED across a multi-node task."""
    name = get_cluster_name()
    test = Test(
        'multi_node_failure',
        [
            # TODO(zhwu): we use multi-thread to run the commands in setup
            # commands in parallel, which makes it impossible to fail fast
            # when one of the nodes fails. We should fix this in the future.
            # The --detach-setup version can fail fast, as the setup is
            # submitted to the remote machine, which does not use multi-thread.
            # Refer to the comment in `subprocess_utils.run_in_parallel`.
            # f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/failed_worker_setup.yaml && exit 1',  # Ensure the job setup failed.
            f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup tests/test_yamls/failed_worker_setup.yaml',
            f'sky logs {name} 1 --status | grep FAILED_SETUP',  # Ensure the job setup failed.
            f'sky exec {name} tests/test_yamls/failed_worker_run.yaml',
            f'sky logs {name} 2 --status | grep FAILED',  # Ensure the job failed.
            f'sky logs {name} 2 | grep "My hostname:" | wc -l | grep 2',  # Ensure there 2 of the hosts printed their hostname.
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on GCP. ----------
@pytest.mark.gcp
def test_gcp_http_server_with_custom_ports():
    """Expose port 33828 on GCP and curl the demo page through the endpoint."""
    name = get_cluster_name()
    test = Test(
        'gcp_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            # NOTE(review): the grep literal was garbled in the source paste;
            # reconstructed as the demo page's <h1> line — confirm against
            # examples/http_server_with_custom_ports/task.yaml.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on AWS. ----------
@pytest.mark.aws
def test_aws_http_server_with_custom_ports():
    """Expose port 33828 on AWS and curl the demo page through the endpoint."""
    name = get_cluster_name()
    test = Test(
        'aws_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi'
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on Azure. ----------
@pytest.mark.azure
def test_azure_http_server_with_custom_ports():
    """Expose port 33828 on Azure and curl the demo page through the endpoint."""
    name = get_cluster_name()
    test = Test(
        'azure_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi'
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on Kubernetes. ----------
@pytest.mark.kubernetes
def test_kubernetes_http_server_with_custom_ports():
    """Expose port 33828 on Kubernetes; more retries since LB IPs come up slowly."""
    name = get_cluster_name()
    test = Test(
        'kubernetes_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 100); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 5; done; if [ "$success" = false ]; then exit 1; fi'
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on Paperspace. ----------
@pytest.mark.paperspace
def test_paperspace_http_server_with_custom_ports():
    """Expose port 33828 on Paperspace and curl the demo page."""
    name = get_cluster_name()
    test = Test(
        'paperspace_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Web apps with custom ports on RunPod. ----------
@pytest.mark.runpod
def test_runpod_http_server_with_custom_ports():
    """Expose port 33828 on RunPod and curl the demo page."""
    name = get_cluster_name()
    test = Test(
        'runpod_http_server_with_custom_ports',
        [
            f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml',
            f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done',
            # Retry a few times to avoid flakiness in ports being open.
            f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "<h1>This is a demo HTML page.</h1>"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi',
        ],
        f'sky down -y {name}',
    )
    run_one_test(test)


# ---------- Labels from task on AWS (instance_tags) ----------
@pytest.mark.aws
def test_task_labels_aws():
    """Render the labels template for AWS and verify tags via the AWS CLI."""
    name = get_cluster_name()
    template_str = pathlib.Path(
        'tests/test_yamls/test_labels.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(cloud='aws', region='us-east-1')
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test = Test(
            'task_labels_aws',
            [
                f'sky launch -y -c {name} {file_path}',
                # Verify with aws cli that the tags are set.
                'aws ec2 describe-instances '
                '--query "Reservations[*].Instances[*].InstanceId" '
                '--filters "Name=instance-state-name,Values=running" '
                f'--filters "Name=tag:skypilot-cluster-name,Values={name}*" '
                '--filters "Name=tag:inlinelabel1,Values=inlinevalue1" '
                '--filters "Name=tag:inlinelabel2,Values=inlinevalue2" '
                '--region us-east-1 --output text',
            ],
            f'sky down -y {name}',
        )
        run_one_test(test)


# ---------- Labels from task on GCP (labels) ----------
@pytest.mark.gcp
def test_task_labels_gcp():
    """Render the labels template for GCP and verify labels via gcloud."""
    name = get_cluster_name()
    template_str = pathlib.Path(
        'tests/test_yamls/test_labels.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(cloud='gcp')
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test = Test(
            'task_labels_gcp',
            [
                f'sky launch -y -c {name} {file_path}',
                # Verify with gcloud cli that the tags are set
                f'gcloud compute instances list --filter="name~\'^{name}\' AND '
                'labels.inlinelabel1=\'inlinevalue1\' AND '
                'labels.inlinelabel2=\'inlinevalue2\'" '
                '--format="value(name)" | grep .',
            ],
            f'sky down -y {name}',
        )
        run_one_test(test)


# ---------- Labels from task on
Kubernetes (labels) ---------- +@pytest.mark.kubernetes +def test_task_labels_kubernetes(): + name = get_cluster_name() + template_str = pathlib.Path( + 'tests/test_yamls/test_labels.yaml.j2').read_text() + template = jinja2.Template(template_str) + content = template.render(cloud='kubernetes') + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(content) + f.flush() + file_path = f.name + test = Test( + 'task_labels_kubernetes', + [ + f'sky launch -y -c {name} {file_path}', + # Verify with kubectl that the labels are set. + 'kubectl get pods ' + '--selector inlinelabel1=inlinevalue1 ' + '--selector inlinelabel2=inlinevalue2 ' + '-o jsonpath=\'{.items[*].metadata.name}\' | ' + f'grep \'^{name}\'' + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Pod Annotations on Kubernetes ---------- +@pytest.mark.kubernetes +def test_add_pod_annotations_for_autodown_with_launch(): + name = get_cluster_name() + test = Test( + 'add_pod_annotations_for_autodown_with_launch', + [ + # Launch Kubernetes cluster with two nodes, each being head node and worker node. + # Autodown is set. + f'sky launch -y -c {name} -i 10 --down --num-nodes 2 --cpus=1 --cloud kubernetes', + # Get names of the pods containing cluster name. + f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', + f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', + # Describe the first pod and check for annotations. + 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + # Describe the second pod and check for annotations. 
+ 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop' + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def test_add_and_remove_pod_annotations_with_autostop(): + name = get_cluster_name() + test = Test( + 'add_and_remove_pod_annotations_with_autostop', + [ + # Launch Kubernetes cluster with two nodes, each being head node and worker node. + f'sky launch -y -c {name} --num-nodes 2 --cpus=1 --cloud kubernetes', + # Set autodown on the cluster with 'autostop' command. + f'sky autostop -y {name} -i 20 --down', + # Get names of the pods containing cluster name. + f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', + f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', + # Describe the first pod and check for annotations. + 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + # Describe the second pod and check for annotations. + 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', + 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', + # Cancel the set autodown to remove the annotations from the pods. + f'sky autostop -y {name} --cancel', + # Describe the first pod and check if annotations are removed. + '! kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', + '! kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', + # Describe the second pod and check if annotations are removed. + '! kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', + '! 
kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Container logs from task on Kubernetes ---------- +@pytest.mark.kubernetes +def test_container_logs_multinode_kubernetes(): + name = get_cluster_name() + task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' + head_logs = ('kubectl get pods ' + f' | grep {name} | grep head | ' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + worker_logs = ('kubectl get pods ' + f' | grep {name} | grep worker |' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + test = Test( + 'container_logs_multinode_kubernetes', + [ + f'sky launch -y -c {name} {task_yaml} --num-nodes 2', + f'{head_logs} | wc -l | grep 9', + f'{worker_logs} | wc -l | grep 9', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def test_container_logs_two_jobs_kubernetes(): + name = get_cluster_name() + task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' + pod_logs = ('kubectl get pods ' + f' | grep {name} | grep head |' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + test = Test( + 'test_container_logs_two_jobs_kubernetes', + [ + f'sky launch -y -c {name} {task_yaml}', + f'{pod_logs} | wc -l | grep 9', + f'sky launch -y -c {name} {task_yaml}', + f'{pod_logs} | wc -l | grep 18', + f'{pod_logs} | grep 1 | wc -l | grep 2', + f'{pod_logs} | grep 2 | wc -l | grep 2', + f'{pod_logs} | grep 3 | wc -l | grep 2', + f'{pod_logs} | grep 4 | wc -l | grep 2', + f'{pod_logs} | grep 5 | wc -l | grep 2', + f'{pod_logs} | grep 6 | wc -l | grep 2', + f'{pod_logs} | grep 7 | wc -l | grep 2', + f'{pod_logs} | grep 8 | wc -l | grep 2', + f'{pod_logs} | grep 9 | wc -l | grep 2', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def 
test_container_logs_two_simultaneous_jobs_kubernetes(): + name = get_cluster_name() + task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' + pod_logs = ('kubectl get pods ' + f' | grep {name} | grep head |' + " awk '{print $1}' | xargs -I {} kubectl logs {}") + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + test = Test( + 'test_container_logs_two_simultaneous_jobs_kubernetes', + [ + f'sky launch -y -c {name}', + f'sky exec -c {name} -d {task_yaml}', + f'sky exec -c {name} -d {task_yaml}', + 'sleep 30', + f'{pod_logs} | wc -l | grep 18', + f'{pod_logs} | grep 1 | wc -l | grep 2', + f'{pod_logs} | grep 2 | wc -l | grep 2', + f'{pod_logs} | grep 3 | wc -l | grep 2', + f'{pod_logs} | grep 4 | wc -l | grep 2', + f'{pod_logs} | grep 5 | wc -l | grep 2', + f'{pod_logs} | grep 6 | wc -l | grep 2', + f'{pod_logs} | grep 7 | wc -l | grep 2', + f'{pod_logs} | grep 8 | wc -l | grep 2', + f'{pod_logs} | grep 9 | wc -l | grep 2', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Task: n=2 nodes with setups. ---------- +@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus +@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +@pytest.mark.skip( + reason= + 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') +def test_distributed_tf(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'resnet_distributed_tf_app', + [ + # NOTE: running it twice will hang (sometimes?) - an app-level bug. + f'python examples/resnet_distributed_tf_app.py {name} {generic_cloud}', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + timeout=25 * 60, # 25 mins (it takes around ~19 mins) + ) + run_one_test(test) + + +# ---------- Testing GCP start and stop instances ---------- +@pytest.mark.gcp +def test_gcp_start_stop(): + name = get_cluster_name() + test = Test( + 'gcp-start-stop', + [ + f'sky launch -y -c {name} examples/gcp_start_stop.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky exec {name} examples/gcp_start_stop.yaml', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. + f'sky logs {name} 3 --status', # Ensure the job succeeded. + f'sky stop -y {name}', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), + f'sky start -y {name} -i 1', + f'sky exec {name} examples/gcp_start_stop.yaml', + f'sky logs {name} 4 --status', # Ensure the job succeeded. + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=200), + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Testing Azure start and stop instances ---------- +@pytest.mark.azure +def test_azure_start_stop(): + name = get_cluster_name() + test = Test( + 'azure-start-stop', + [ + f'sky launch -y -c {name} examples/azure_start_stop.yaml', + f'sky exec {name} examples/azure_start_stop.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. + f'sky logs {name} 2 --status', # Ensure the job succeeded. 
+ f'sky stop -y {name}', + f'sky start -y {name} -i 1', + f'sky exec {name} examples/azure_start_stop.yaml', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=280) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', + ], + f'sky down -y {name}', + timeout=30 * 60, # 30 mins + ) + run_one_test(test) + + +# ---------- Testing Autostopping ---------- +@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances +@pytest.mark.no_ibm # FIX(IBM) sporadically fails, as restarted workers stay uninitialized indefinitely +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +@pytest.mark.no_kubernetes # Kubernetes does not autostop yet +def test_autostop(generic_cloud: str): + name = get_cluster_name() + # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure + # the VM is stopped. + autostop_timeout = 600 if generic_cloud == 'azure' else 250 + # Launching and starting Azure clusters can take a long time too. e.g., restart + # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. + total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 + test = Test( + 'autostop', + [ + f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} -i 1', + + # Ensure autostop is set. + f'sky status | grep {name} | grep "1m"', + + # Ensure the cluster is not stopped early. + 'sleep 40', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', + + # Ensure the cluster is STOPPED. 
+ WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), + + # Ensure the cluster is UP and the autostop setting is reset ('-'). + f'sky start -y {name}', + f'sky status | grep {name} | grep -E "UP\s+-"', + + # Ensure the job succeeded. + f'sky exec {name} tests/test_yamls/minimal.yaml', + f'sky logs {name} 2 --status', + + # Test restarting the idleness timer via reset: + f'sky autostop -y {name} -i 1', # Idleness starts counting. + 'sleep 40', # Almost reached the threshold. + f'sky autostop -y {name} -i 1', # Should restart the timer. + 'sleep 40', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), + + # Test restarting the idleness timer via exec: + f'sky start -y {name}', + f'sky status | grep {name} | grep -E "UP\s+-"', + f'sky autostop -y {name} -i 1', # Idleness starts counting. + 'sleep 45', # Almost reached the threshold. + f'sky exec {name} echo hi', # Should restart the timer. + 'sleep 45', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout + BUMP_UP_SECONDS), + ], + f'sky down -y {name}', + timeout=total_timeout_minutes * 60, + ) + run_one_test(test) + + +# ---------- Testing Autodowning ---------- +@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. +def test_autodown(generic_cloud: str): + name = get_cluster_name() + # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure + # the VM is terminated. 
+ autodown_timeout = 900 if generic_cloud == 'azure' else 240 + total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 + test = Test( + 'autodown', + [ + f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --down -i 1', + # Ensure autostop is set. + f'sky status | grep {name} | grep "1m (down)"', + # Ensure the cluster is not terminated early. + 'sleep 40', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', + # Ensure the cluster is terminated. + f'sleep {autodown_timeout}', + f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. + f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep "1m (down)"', + f'sleep {autodown_timeout}', + # Ensure the cluster is terminated. + f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --cancel', + f'sleep {autodown_timeout}', + # Ensure the cluster is still UP. 
+ f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP', + ], + f'sky down -y {name}', + timeout=total_timeout_minutes * 60, + ) + run_one_test(test) + + +@pytest.mark.scp +def test_scp_autodown(): + name = get_cluster_name() + test = Test( + 'SCP_autodown', + [ + f'sky launch -y -d -c {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --down -i 1', + # Ensure autostop is set. + f'sky status | grep {name} | grep "1m (down)"', + # Ensure the cluster is not terminated early. + 'sleep 45', + f'sky status --refresh | grep {name} | grep UP', + # Ensure the cluster is terminated. + 'sleep 200', + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. + f'sky exec {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky status | grep {name} | grep "1m (down)"', + 'sleep 200', + # Ensure the cluster is terminated. + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky autostop -y {name} --cancel', + 'sleep 200', + # Ensure the cluster is still UP. + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && echo "$s" | grep {name} | grep UP', + ], + f'sky down -y {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): + test = Test( + f'{cloud}-cancel-task', + [ + f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', + # Wait the GPU process to start. 
+ 'sleep 60', + f'sky exec {name} "nvidia-smi | grep python"', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky cancel -y {name} 1', + 'sleep 60', + # check if the python job is gone. + f'sky exec {name} "! nvidia-smi | grep python"', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=timeout, + ) + return test + + +# ---------- Testing `sky cancel` ---------- +@pytest.mark.aws +def test_cancel_aws(): + name = get_cluster_name() + test = _get_cancel_task_with_cloud(name, 'aws') + run_one_test(test) + + +@pytest.mark.gcp +def test_cancel_gcp(): + name = get_cluster_name() + test = _get_cancel_task_with_cloud(name, 'gcp') + run_one_test(test) + + +@pytest.mark.azure +def test_cancel_azure(): + name = get_cluster_name() + test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now +@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus +@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA +@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet +def test_cancel_pytorch(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'cancel-pytorch', + [ + f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d', + # Wait the GPU process to start. + 'sleep 90', + f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || ' + # When run inside container/k8s, nvidia-smi cannot show process ids. + # See https://github.com/NVIDIA/nvidia-docker/issues/179 + # To work around, we check if GPU utilization is greater than 0. + f'[ \$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits) -gt 0 ]"', + f'sky logs {name} 2 --status', # Ensure the job succeeded. 
+ f'sky cancel -y {name} 1', + 'sleep 60', + f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep \'No running process\') || ' + # Ensure Xorg is the only process running. + '[ \$(nvidia-smi | grep -A 10 Processes | grep -A 10 === | grep -v Xorg) -eq 2 ]"', + f'sky logs {name} 3 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + +# can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` +# requires a CUDA public image, which IBM doesn't offer +@pytest.mark.ibm +def test_cancel_ibm(): + name = get_cluster_name() + test = Test( + 'ibm-cancel-task', + [ + f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', + f'sky exec {name} -n {name}-1 -d "while true; do echo \'Hello SkyPilot\'; sleep 2; done"', + 'sleep 20', + f'sky queue {name} | grep {name}-1 | grep RUNNING', + f'sky cancel -y {name} 2', + f'sleep 5', + f'sky queue {name} | grep {name}-1 | grep CANCELLED', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Testing use-spot option ---------- +@pytest.mark.no_fluidstack # FluidStack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +def test_use_spot(generic_cloud: str): + """Test use-spot and sky exec.""" + name = get_cluster_name() + test = Test( + 'use-spot', + [ + f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', + f'sky logs {name} 1 --status', + f'sky exec {name} echo hi', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_stop_gcp_spot(): + """Test GCP spot can be stopped, autostopped, restarted.""" + name = 
get_cluster_name() + test = Test( + 'stop_gcp_spot', + [ + f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', + # stop should go through: + f'sky stop {name} -y', + f'sky start {name} -y', + f'sky exec {name} -- ls myfile', + f'sky logs {name} 2 --status', + f'sky autostop {name} -i0 -y', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=90), + f'sky start {name} -y', + f'sky exec {name} -- ls myfile', + f'sky logs {name} 3 --status', + # -i option at launch should go through: + f'sky launch -c {name} -i0 -y', + WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=120), + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +# ---------- Testing env ---------- +def test_inline_env(generic_cloud: str): + """Test env""" + name = get_cluster_name() + test = Test( + 'test-inline-env', + [ + f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + 'sleep 20', + f'sky logs {name} 1 --status', + f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + get_timeout(generic_cloud), + ) + run_one_test(test) + + +# ---------- Testing env file ---------- +def test_inline_env_file(generic_cloud: str): + """Test env""" + name = get_cluster_name() + test = Test( + 'test-inline-env-file', + [ + f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + f'sky logs {name} 1 --status', + f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + f'sky logs {name} 2 --status', + ], + f'sky down -y {name}', + get_timeout(generic_cloud), + ) + run_one_test(test) + + +# ---------- Testing custom image ---------- +@pytest.mark.aws +def test_aws_custom_image(): + """Test AWS custom image""" + name = get_cluster_name() + test = Test( + 'test-aws-custom-image', + [ + f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image + f'sky logs {name} 1 --status', + ], + f'sky down -y {name}', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.kubernetes +@pytest.mark.parametrize( + 'image_id', + [ + 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', + 'docker:ubuntu:18.04', + # Test latest image with python 3.11 installed by default. + 'docker:continuumio/miniconda3:24.1.2-0', + # Test python>=3.12 where SkyPilot should automatically create a separate + # conda env for runtime with python 3.10. 
+ 'docker:continuumio/miniconda3:latest', + ]) +def test_kubernetes_custom_image(image_id): + """Test Kubernetes custom image""" + name = get_cluster_name() + test = Test( + 'test-kubernetes-custom-image', + [ + f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', + f'sky logs {name} 1 --status', + # Try exec to run again and check if the logs are printed + f'sky exec {name} tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1 | grep "Hello 100"', + # Make sure ssh is working with custom username + f'ssh {name} echo hi | grep hi', + ], + f'sky down -y {name}', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_start_stop_two_nodes(): + name = get_cluster_name() + test = Test( + 'azure-start-stop-two-nodes', + [ + f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', + f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky stop -y {name}', + f'sky start -y {name} -i 1', + f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', + f'sky logs {name} 2 --status', # Ensure the job succeeded. 
+ WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + timeout=200 + BUMP_UP_SECONDS) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' + ], + f'sky down -y {name}', + timeout=30 * 60, # 30 mins (it takes around ~23 mins) + ) + run_one_test(test) + + +# ---------- Testing env for disk tier ---------- +@pytest.mark.aws +def test_aws_disk_tier(): + + def _get_aws_query_command(region, instance_id, field, expected): + return (f'aws ec2 describe-volumes --region {region} ' + f'--filters Name=attachment.instance-id,Values={instance_id} ' + f'--query Volumes[*].{field} | grep {expected} ; ') + + for disk_tier in list(resources_utils.DiskTier): + specs = AWS._get_disk_specs(disk_tier) + name = get_cluster_name() + '-' + disk_tier.value + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.AWS.max_cluster_name_length()) + region = 'us-east-2' + test = Test( + 'aws-disk-tier-' + disk_tier.value, + [ + f'sky launch -y -c {name} --cloud aws --region {region} ' + f'--disk-tier {disk_tier.value} echo "hello sky"', + f'id=`aws ec2 describe-instances --region {region} --filters ' + f'Name=tag:ray-cluster-name,Values={name_on_cloud} --query ' + f'Reservations[].Instances[].InstanceId --output text`; ' + + _get_aws_query_command(region, '$id', 'VolumeType', + specs['disk_tier']) + + ('' if specs['disk_tier'] + == 'standard' else _get_aws_query_command( + region, '$id', 'Iops', specs['disk_iops'])) + + ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( + region, '$id', 'Throughput', specs['disk_throughput'])), + ], + f'sky down -y {name}', + timeout=10 * 60, # 10 mins (it takes around ~6 mins) + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_disk_tier(): + for disk_tier in list(resources_utils.DiskTier): + disk_types = [GCP._get_disk_type(disk_tier)] + name = get_cluster_name() + '-' + disk_tier.value + name_on_cloud = 
common_utils.make_cluster_name_on_cloud( + name, sky.GCP.max_cluster_name_length()) + region = 'us-west2' + instance_type_options = [''] + if disk_tier == resources_utils.DiskTier.BEST: + # Ultra disk tier requires n2 instance types to have more than 64 CPUs. + # If using default instance type, it will only enable the high disk tier. + disk_types = [ + GCP._get_disk_type(resources_utils.DiskTier.HIGH), + GCP._get_disk_type(resources_utils.DiskTier.ULTRA), + ] + instance_type_options = ['', '--instance-type n2-standard-64'] + for disk_type, instance_type_option in zip(disk_types, + instance_type_options): + test = Test( + 'gcp-disk-tier-' + disk_tier.value, + [ + f'sky launch -y -c {name} --cloud gcp --region {region} ' + f'--disk-tier {disk_tier.value} {instance_type_option} ', + f'name=`gcloud compute instances list --filter=' + f'"labels.ray-cluster-name:{name_on_cloud}" ' + '--format="value(name)"`; ' + f'gcloud compute disks list --filter="name=$name" ' + f'--format="value(type)" | grep {disk_type} ' + ], + f'sky down -y {name}', + timeout=6 * 60, # 6 mins (it takes around ~3 mins) + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_disk_tier(): + for disk_tier in list(resources_utils.DiskTier): + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + # Azure does not support high and ultra disk tier. 
+ continue + type = Azure._get_disk_type(disk_tier) + name = get_cluster_name() + '-' + disk_tier.value + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.Azure.max_cluster_name_length()) + region = 'westus2' + test = Test( + 'azure-disk-tier-' + disk_tier.value, + [ + f'sky launch -y -c {name} --cloud azure --region {region} ' + f'--disk-tier {disk_tier.value} echo "hello sky"', + f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' + f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' + f'--output tsv | grep {type}' + ], + f'sky down -y {name}', + timeout=20 * 60, # 20 mins (it takes around ~12 mins) + ) + run_one_test(test) + + +@pytest.mark.azure +def test_azure_best_tier_failover(): + type = Azure._get_disk_type(resources_utils.DiskTier.LOW) + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, sky.Azure.max_cluster_name_length()) + region = 'westus2' + test = Test( + 'azure-best-tier-failover', + [ + f'sky launch -y -c {name} --cloud azure --region {region} ' + f'--disk-tier best --instance-type Standard_D8_v5 echo "hello sky"', + f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' + f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' + f'--output tsv | grep {type}', + ], + f'sky down -y {name}', + timeout=20 * 60, # 20 mins (it takes around ~12 mins) + ) + run_one_test(test) + + +# ------ Testing Zero Quota Failover ------ +@pytest.mark.aws +def test_aws_zero_quota_failover(): + + name = get_cluster_name() + region = get_aws_region_for_quota_failover() + + if not region: + pytest.xfail( + 'Unable to test zero quota failover optimization — quotas ' + 'for EC2 P3 instances were found on all AWS regions. 
Is this ' + 'expected for your account?') + return + + test = Test( + 'aws-zero-quota-failover', + [ + f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_zero_quota_failover(): + + name = get_cluster_name() + region = get_gcp_region_for_quota_failover() + + if not region: + pytest.xfail( + 'Unable to test zero quota failover optimization — quotas ' + 'for A100-80GB GPUs were found on all GCP regions. Is this ' + 'expected for your account?') + return + + test = Test( + 'gcp-zero-quota-failover', + [ + f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +def test_long_setup_run_script(generic_cloud: str): + name = get_cluster_name() + with tempfile.NamedTemporaryFile('w', prefix='sky_app_', + suffix='.yaml') as f: + f.write( + textwrap.dedent(""" \ + setup: | + echo "start long setup" + """)) + for i in range(1024 * 200): + f.write(f' echo {i}\n') + f.write(' echo "end long setup"\n') + f.write( + textwrap.dedent(""" \ + run: | + echo "run" + """)) + for i in range(1024 * 200): + f.write(f' echo {i}\n') + f.write(' echo "end run"\n') + f.flush() + + test = Test( + 'long-setup-run-script', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', + f'sky exec {name} "echo hello"', + f'sky exec {name} {f.name}', + f'sky logs {name} --status 1', + f'sky logs {name} --status 2', + f'sky logs {name} --status 3', + ], + f'sky down -y {name}', + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index 96ce2f59c0c..e2e4c440b89 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -1,34 +1,28 @@ -# Smoke tests for SkyPilot +# Smoke tests for SkyPilot for image functionality # Default options are set in 
pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py +# > pytest tests/smoke_tests/test_images.py # # Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure +# > pytest tests/smoke_tests/test_images.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve +# > pytest tests/smoke_tests/test_images.py::test_aws_images # # Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws +# > pytest tests/smoke_tests/test_images.py --aws # # Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws +# > pytest tests/smoke_tests/test_images.py --generic-cloud aws import pytest -from smoke_tests.util import _get_cluster_name from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS +from smoke_tests.util import get_cluster_name from smoke_tests.util import run_one_test from smoke_tests.util import Test @@ -38,7 +32,7 @@ # ---------- Test the image ---------- @pytest.mark.aws def test_aws_images(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_images', [ @@ -58,7 +52,7 @@ def test_aws_images(): @pytest.mark.gcp def test_gcp_images(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_images', [ @@ -78,7 +72,7 @@ def test_gcp_images(): @pytest.mark.azure def test_azure_images(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'azure_images', [ @@ -98,7 +92,7 @@ def test_azure_images(): @pytest.mark.aws def test_aws_image_id_dict(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_image_id_dict', [ @@ -117,7 +111,7 @@ 
def test_aws_image_id_dict(): @pytest.mark.gcp def test_gcp_image_id_dict(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_image_id_dict', [ @@ -136,7 +130,7 @@ def test_gcp_image_id_dict(): @pytest.mark.aws def test_aws_image_id_dict_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_image_id_dict_region', [ @@ -173,7 +167,7 @@ def test_aws_image_id_dict_region(): @pytest.mark.gcp def test_gcp_image_id_dict_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_image_id_dict_region', [ @@ -206,7 +200,7 @@ def test_gcp_image_id_dict_region(): @pytest.mark.aws def test_aws_image_id_dict_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_image_id_dict_zone', [ @@ -244,7 +238,7 @@ def test_aws_image_id_dict_zone(): @pytest.mark.gcp def test_gcp_image_id_dict_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_image_id_dict_zone', [ @@ -278,7 +272,7 @@ def test_gcp_image_id_dict_zone(): @pytest.mark.aws def test_clone_disk_aws(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'clone_disk_aws', [ @@ -305,7 +299,7 @@ def test_clone_disk_aws(): @pytest.mark.gcp def test_clone_disk_gcp(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'clone_disk_gcp', [ @@ -324,7 +318,7 @@ def test_clone_disk_gcp(): @pytest.mark.gcp def test_gcp_mig(): - name = _get_cluster_name() + name = get_cluster_name() region = 'us-central1' test = Test( 'gcp_mig', @@ -354,7 +348,7 @@ def test_gcp_mig(): @pytest.mark.gcp def test_gcp_force_enable_external_ips(): - name = _get_cluster_name() + name = get_cluster_name() test_commands = [ f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', # Check network of vm is "default" @@ -376,7 +370,7 @@ def test_gcp_force_enable_external_ips(): @pytest.mark.aws def test_image_no_conda(): - name = _get_cluster_name() + name = 
get_cluster_name() test = Test( 'image_no_conda', [ @@ -396,7 +390,7 @@ def test_image_no_conda(): @pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_custom_default_conda_env(generic_cloud: str): - name = _get_cluster_name() + name = get_cluster_name() test = Test('custom_default_conda_env', [ f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky status -r {name} | grep "UP"', diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py new file mode 100644 index 00000000000..521b08797f5 --- /dev/null +++ b/tests/smoke_tests/test_managed_job.py @@ -0,0 +1,766 @@ +# Smoke tests for SkyPilot for managed jobs +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_managed_job.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_managed_job.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs +# +# Only run managed job tests +# > pytest tests/smoke_tests/test_managed_job.py --managed-jobs +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_managed_job.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_managed_job.py --generic-cloud aws + +import pathlib +import tempfile +import time + +import pytest +from smoke_tests.util import _BUMP_UP_SECONDS +from smoke_tests.util import get_cluster_name +from smoke_tests.util import GET_JOB_QUEUE +from smoke_tests.util import JOB_WAIT_NOT_RUNNING +from smoke_tests.util import run_one_test +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test +from smoke_tests.util 
import TestStorageWithCredentials +from smoke_tests.util import ( + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) + +from sky import jobs +from sky.data import storage as storage_lib +from sky.jobs.state import ManagedJobStatus +from sky.skylet import constants +from sky.utils import common_utils + + +# ---------- Testing managed job ---------- +# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues +# when the controller being on Azure, which takes a long time for launching +# step. +@pytest.mark.managed_jobs +def test_managed_jobs(generic_cloud: str): + """Test the managed jobs yaml.""" + name = get_cluster_name() + test = Test( + 'managed-jobs', + [ + f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', + f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status= + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60), + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status= + f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60), + f'sky jobs cancel -y -n {name}-1', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status=f'{ManagedJobStatus.CANCELLED.value}', + timeout=230), + # Test the functionality for logging. + f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', + f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', + f'{GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', + ], + # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when + # canceling multiple job names is supported. 
+ f'sky jobs cancel -y -n {name}-1; sky jobs cancel -y -n {name}-2', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack #fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_job_pipeline(generic_cloud: str): + """Test a job pipeline.""" + name = get_cluster_name() + test = Test( + 'spot-pipeline', + [ + f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', + 'sleep 5', + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + # `grep -A 4 {name}` finds the job with {name} and the 4 lines + # after it, i.e. the 4 tasks within the job. + # `sed -n 2p` gets the second line of the 4 lines, i.e. the first + # task within the job. 
+ f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', + f'sky jobs cancel -y -n {name}', + 'sleep 5', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', + 'sleep 200', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + ], + f'sky jobs cancel -y -n {name}', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack #fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_managed_jobs_failed_setup(generic_cloud: str): + """Test managed job with failed setup.""" + name = get_cluster_name() + test = Test( + 'managed_jobs_failed_setup', + [ + f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', + # Make sure the job failed quickly. 
+ WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + timeout=330 + _BUMP_UP_SECONDS), + ], + f'sky jobs cancel -y -n {name}', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack #fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): + """Test managed job with failed setup for a pipeline.""" + name = get_cluster_name() + test = Test( + 'managed_jobs_pipeline_failed_setup', + [ + f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + timeout=600), + # Make sure the job failed quickly. + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + # Task 0 should be SUCCEEDED. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', + # Task 1 should be FAILED_SETUP. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', + # Task 2 should be CANCELLED. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + # Task 3 should be CANCELLED. + f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + ], + f'sky jobs cancel -y -n {name}', + # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
+ timeout=30 * 60, + ) + run_one_test(test) + + +# ---------- Testing managed job recovery ---------- + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_aws(aws_config_region): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + region = aws_config_region + test = Test( + 'managed_jobs_recovery_aws', + [ + f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=600), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the cluster manually. + (f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_gcp(): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + zone = 'us-east4-b' + query_cmd = ( + f'gcloud compute 
instances list --filter=' + # `:` means prefix match. + f'"(labels.ray-cluster-name:{name_on_cloud})" ' + f'--zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_recovery_gcp', + [ + f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=300), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the cluster manually. + terminate_cmd, + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_pipeline_recovery_aws(aws_config_region): + """Test managed job recovery for a pipeline.""" + name = get_cluster_name() + user_hash = common_utils.get_user_hash() + user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] + region = aws_config_region + if region != 'us-east-2': + pytest.skip('Only run spot pipeline recovery test in us-east-2') + test = Test( + 'managed_jobs_pipeline_recovery_aws', + [ + f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | 
grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', + # Terminate the cluster manually. + # The `cat ...| rev` is to retrieve the job_id from the + # SKYPILOT_TASK_ID, which gets the second to last field + # separated by `-`. + ( + f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' + 'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;' + f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + # TODO(zhwu): fix the name for spot cluster. + '--filters Name=tag:ray-cluster-name,Values=*-${MANAGED_JOB_ID}' + f'-{user_hash} ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', + f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', + f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_pipeline_recovery_gcp(): + """Test managed job recovery for a pipeline.""" + name = get_cluster_name() + zone = 'us-east4-b' + user_hash = common_utils.get_user_hash() + user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] + query_cmd = ( + 'gcloud compute instances list --filter=' + 
f'"(labels.ray-cluster-name:*-${{MANAGED_JOB_ID}}-{user_hash})" ' + f'--zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_pipeline_recovery_gcp', + [ + f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', + # Terminate the cluster manually. + # The `cat ...| rev` is to retrieve the job_id from the + # SKYPILOT_TASK_ID, which gets the second to last field + # separated by `-`. + (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' + f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=200), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', + f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', + f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', + f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support spot instances +@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances +@pytest.mark.no_ibm # IBM Cloud does not 
support spot instances +@pytest.mark.no_scp # SCP does not support spot instances +@pytest.mark.no_paperspace # Paperspace does not support spot instances +@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_default_resources(generic_cloud: str): + """Test managed job recovery for default resources.""" + name = get_cluster_name() + test = Test( + 'managed-spot-recovery-default-resources', + [ + f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + timeout=360), + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_multi_node_aws(aws_config_region): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + region = aws_config_region + test = Test( + 'managed_jobs_recovery_multi_node_aws', + [ + f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=450), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the worker manually. 
+ (f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' + 'Name=tag:ray-node-type,Values=worker ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_recovery_multi_node_gcp(): + """Test managed job recovery.""" + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + zone = 'us-west2-a' + # Use ':' to match as the cluster name will contain the suffix with job id + query_cmd = ( + f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{name_on_cloud} AND ' + f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_recovery_multi_node_gcp', + [ + f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=400), + f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', + # Terminate the worker manually. 
+ terminate_cmd, + JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.RUNNING.value, + timeout=560), + f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', + ], + f'sky jobs cancel -y -n {name}', + timeout=25 * 60, + ) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.managed_jobs +def test_managed_jobs_cancellation_aws(aws_config_region): + name = get_cluster_name() + name_on_cloud = common_utils.make_cluster_name_on_cloud( + name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + name_2_on_cloud = common_utils.make_cluster_name_on_cloud( + f'{name}-2', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + name_3_on_cloud = common_utils.make_cluster_name_on_cloud( + f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + region = aws_config_region + test = Test( + 'managed_jobs_cancellation_aws', + [ + # Test cancellation during spot cluster being launched. 
+ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status= + f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + timeout=60 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + (f's=$(aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' + f'--query Reservations[].Instances[].State[].Name ' + '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' + ), + # Test cancelling the spot cluster during spot job being setup. + f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', + # The job is set up in the cluster, will shown as RUNNING. + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}-2', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + (f's=$(aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' + f'--query Reservations[].Instances[].State[].Name ' + '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' + ), + # Test cancellation during spot job is recovering. + f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', + # The job is running in the cluster, will shown as RUNNING. 
+ WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + # Terminate the cluster manually. + (f'aws ec2 terminate-instances --region {region} --instance-ids $(' + f'aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' + f'--query Reservations[].Instances[].InstanceId ' + '--output text)'), + JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + f'sky jobs cancel -y -n {name}-3', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because + # there can be multiple VM with the same name due to the recovery. + (f's=$(aws ec2 describe-instances --region {region} ' + f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' + f'--query Reservations[].Instances[].State[].Name ' + '--output text) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "pending|running|stopped|stopping"' + ), + ], + timeout=25 * 60) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.managed_jobs +def test_managed_jobs_cancellation_gcp(): + name = get_cluster_name() + name_3 = f'{name}-3' + name_3_on_cloud = common_utils.make_cluster_name_on_cloud( + name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) + zone = 'us-west3-b' + query_state_cmd = ( + 'gcloud compute instances list ' + f'--filter="(labels.ray-cluster-name:{name_3_on_cloud})" ' + '--format="value(status)"') + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{name_3_on_cloud})" ' + f'--zones={zone} --format="value(name)"') + terminate_cmd = (f'gcloud compute instances delete --zone={zone}' + 
f' --quiet $({query_cmd})') + test = Test( + 'managed_jobs_cancellation_gcp', + [ + # Test cancellation during spot cluster being launched. + f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.STARTING.value, + timeout=60 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # Test cancelling the spot cluster during spot job being setup. + f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', + # The job is set up in the cluster, will shown as RUNNING. + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + f'sky jobs cancel -y -n {name}-2', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # Test cancellation during spot job is recovering. + f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.RUNNING.value, + timeout=300 + _BUMP_UP_SECONDS), + # Terminate the cluster manually. + terminate_cmd, + JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + f'sky jobs cancel -y -n {name}-3', + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-3', + job_status=ManagedJobStatus.CANCELLED.value, + timeout=120 + _BUMP_UP_SECONDS), + # The cluster should be terminated (STOPPING) after cancellation. 
# ---------- Testing storage for managed job ----------
@pytest.mark.no_fluidstack  # Fluidstack does not support spot instances
@pytest.mark.no_lambda_cloud  # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm  # IBM Cloud does not support spot instances
@pytest.mark.no_paperspace  # Paperspace does not support spot instances
@pytest.mark.no_scp  # SCP does not support spot instances
@pytest.mark.managed_jobs
def test_managed_jobs_storage(generic_cloud: str):
    """Test storage with managed job.

    Launches a managed job whose YAML mounts an input bucket and writes to an
    output bucket, then checks the job succeeds, the input bucket is cleaned
    up, and the output file landed in the output bucket.
    """
    name = get_cluster_name()
    yaml_str = pathlib.Path(
        'examples/managed_job_with_storage.yaml').read_text()
    timestamp = int(time.time())
    storage_name = f'sky-test-{timestamp}'
    output_storage_name = f'sky-test-output-{timestamp}'

    # Also perform region testing for bucket creation to validate if buckets
    # are created in the correct region and correctly mounted in managed jobs.
    # However, we inject this testing only for AWS and GCP since they are the
    # supported object storage providers in SkyPilot.
    region_flag = ''
    region_validation_cmd = 'true'
    use_spot = ' --use-spot'
    if generic_cloud == 'aws':
        region = 'eu-central-1'
        region_flag = f' --region {region}'
        region_cmd = TestStorageWithCredentials.cli_region_cmd(
            storage_lib.StoreType.S3, bucket_name=storage_name)
        region_validation_cmd = f'{region_cmd} | grep {region}'
        s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.S3, output_storage_name, 'output.txt')
        output_check_cmd = f'{s3_check_file_count} | grep 1'
    elif generic_cloud == 'gcp':
        region = 'us-west2'
        region_flag = f' --region {region}'
        region_cmd = TestStorageWithCredentials.cli_region_cmd(
            storage_lib.StoreType.GCS, bucket_name=storage_name)
        region_validation_cmd = f'{region_cmd} | grep {region}'
        gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.GCS, output_storage_name, 'output.txt')
        output_check_cmd = f'{gcs_check_file_count} | grep 1'
    elif generic_cloud == 'azure':
        region = 'westus2'
        region_flag = f' --region {region}'
        # Azure bucket region is determined by its storage account's region.
        storage_account_name = (
            storage_lib.AzureBlobStore.get_default_storage_account_name(region))
        region_cmd = TestStorageWithCredentials.cli_region_cmd(
            storage_lib.StoreType.AZURE,
            storage_account_name=storage_account_name)
        region_validation_cmd = f'{region_cmd} | grep {region}'
        az_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.AZURE,
            output_storage_name,
            'output.txt',
            storage_account_name=storage_account_name)
        output_check_cmd = f'{az_check_file_count} | grep 1'
    elif generic_cloud == 'kubernetes':
        # With Kubernetes, we don't know which object storage provider is
        # used. Check both S3 and GCS if bucket exists in either.
        s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.S3, output_storage_name, 'output.txt')
        s3_output_check_cmd = f'{s3_check_file_count} | grep 1'
        gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket(
            storage_lib.StoreType.GCS, output_storage_name, 'output.txt')
        gcs_output_check_cmd = f'{gcs_check_file_count} | grep 1'
        output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}'
        use_spot = ' --no-use-spot'

    yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name)
    yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(yaml_str)
        f.flush()
        file_path = f.name
        test = Test(
            'managed_jobs_storage',
            [
                *STORAGE_SETUP_COMMANDS,
                f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y',
                region_validation_cmd,  # Check if the bucket is created in the correct region
                WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                    job_name=name,
                    job_status=ManagedJobStatus.SUCCEEDED.value,
                    timeout=60 + _BUMP_UP_SECONDS),
                # The input bucket (non-persistent) must be cleaned up.
                f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]',
                # Check if file was written to the mounted output bucket
                output_check_cmd
            ],
            # FIX: was a 2-tuple (stray comma); teardown must be a single
            # shell string, so concatenate the two parts.
            (f'sky jobs cancel -y -n {name}'
             f'; sky storage delete {output_storage_name} || true'),
            # Increase timeout since sky jobs queue -r can be blocked by other
            # spot tests.
            timeout=20 * 60,
        )
        run_one_test(test)


# ---------- Testing spot TPU ----------
@pytest.mark.gcp
@pytest.mark.managed_jobs
@pytest.mark.tpu
def test_managed_jobs_tpu():
    """Test managed job on TPU."""
    name = get_cluster_name()
    test = Test(
        'test-spot-tpu',
        [
            f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d',
            WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                job_name=name,
                job_status=ManagedJobStatus.STARTING.value,
                timeout=60 + _BUMP_UP_SECONDS),
            # TPU takes a while to launch
            WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                job_name=name,
                job_status=
                f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})',
                timeout=900 + _BUMP_UP_SECONDS),
        ],
        f'sky jobs cancel -y -n {name}',
        # Increase timeout since sky jobs queue -r can be blocked by other
        # spot tests.
        timeout=20 * 60,
    )
    run_one_test(test)


# ---------- Testing env for managed jobs ----------
@pytest.mark.managed_jobs
def test_managed_jobs_inline_env(generic_cloud: str):
    """Test managed jobs env: --env values and SKYPILOT_* vars are visible."""
    name = get_cluster_name()
    test = Test(
        'test-managed-jobs-inline-env',
        [
            f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
            WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
                job_name=name,
                job_status=ManagedJobStatus.SUCCEEDED.value,
                timeout=20 + _BUMP_UP_SECONDS),
        ],
        f'sky jobs cancel -y -n {name}',
        # Increase timeout since sky jobs queue -r can be blocked by other
        # spot tests.
        timeout=20 * 60,
    )
    run_one_test(test)
+ timeout=20 * 60, + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py new file mode 100644 index 00000000000..95952d3b432 --- /dev/null +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -0,0 +1,1503 @@ +# Smoke tests for SkyPilot for mounting storage +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_mount_and_storage.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_mount_and_storage.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_mount_and_storage.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_mount_and_storage.py --generic-cloud aws + +import os +import pathlib +import shlex +import shutil +import subprocess +import tempfile +import time +from typing import Dict, Optional +import urllib.parse +import uuid + +import jinja2 +import pytest +from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_timeout +from smoke_tests.util import run_one_test +from smoke_tests.util import SCP_TYPE +from smoke_tests.util import STORAGE_SETUP_COMMANDS +from smoke_tests.util import Test +from smoke_tests.util import TestStorageWithCredentials + +import sky +from sky import global_user_state +from sky import skypilot_config +from sky.adaptors import cloudflare +from sky.adaptors import ibm +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.data.data_utils import Rclone + + +# ---------- file_mounts ---------- +@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. 
@pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead.
def test_file_mounts(generic_cloud: str):
    """Launch a cluster with file mounts and verify the job succeeds."""
    name = get_cluster_name()
    extra_flags = ''
    # FIX: was `generic_cloud in 'kubernetes'`, i.e. substring membership on a
    # string (true for 'k', 'kube', ...). Use equality.
    if generic_cloud == 'kubernetes':
        # Kubernetes does not support multi-node
        # NOTE: This test will fail if you have a Kubernetes cluster running on
        # arm64 (e.g., Apple Silicon) since goofys does not work on arm64.
        extra_flags = '--num-nodes 1'
    test_commands = [
        *STORAGE_SETUP_COMMANDS,
        f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml',
        f'sky logs {name} 1 --status',  # Ensure the job succeeded.
    ]
    test = Test(
        'using_file_mounts',
        test_commands,
        f'sky down -y {name}',
        get_timeout(generic_cloud, 20 * 60),  # 20 mins
    )
    run_one_test(test)


@pytest.mark.scp
def test_scp_file_mounts():
    """Single-node file-mounts test on SCP."""
    name = get_cluster_name()
    test_commands = [
        *STORAGE_SETUP_COMMANDS,
        f'sky launch -y -c {name} {SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml',
        f'sky logs {name} 1 --status',  # Ensure the job succeeded.
    ]
    test = Test(
        'SCP_using_file_mounts',
        test_commands,
        f'sky down -y {name}',
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


@pytest.mark.no_fluidstack  # Requires GCP to be enabled
def test_using_file_mounts_with_env_vars(generic_cloud: str):
    """File mounts whose paths come from --env variables."""
    name = get_cluster_name()
    storage_name = TestStorageWithCredentials.generate_bucket_name()
    test_commands = [
        *STORAGE_SETUP_COMMANDS,
        (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} '
         'examples/using_file_mounts_with_env_vars.yaml '
         f'--env MY_BUCKET={storage_name}'),
        f'sky logs {name} 1 --status',  # Ensure the job succeeded.
        # Override with --env:
        (f'sky launch -y -c {name}-2 --cpus 2+ --cloud {generic_cloud} '
         'examples/using_file_mounts_with_env_vars.yaml '
         f'--env MY_BUCKET={storage_name} '
         '--env MY_LOCAL_PATH=tmpfile'),
        f'sky logs {name}-2 1 --status',  # Ensure the job succeeded.
    ]
    test = Test(
        'using_file_mounts_with_env_vars',
        test_commands,
        # FIX: was a 2-tuple (stray comma); teardown must be one shell string.
        (f'sky down -y {name} {name}-2; '
         f'sky storage delete -y {storage_name} {storage_name}-2'),
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


# ---------- storage ----------
@pytest.mark.aws
def test_aws_storage_mounts_with_stop():
    """Mount an S3 bucket, stop/start the cluster, and verify the mount survives."""
    name = get_cluster_name()
    cloud = 'aws'
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name, cloud=cloud)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'aws s3 ls {storage_name}/hello.txt',
            f'sky stop -y {name}',
            f'sky start -y {name}',
            # Check if hello.txt from mounting bucket exists after restart in
            # the mounted directory
            f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"'
        ]
        test = Test(
            'aws_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.gcp
def test_gcp_storage_mounts_with_stop():
    """Mount a GCS bucket, stop/start the cluster, and verify the mount survives."""
    name = get_cluster_name()
    cloud = 'gcp'
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name, cloud=cloud)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'gsutil ls gs://{storage_name}/hello.txt',
            f'sky stop -y {name}',
            f'sky start -y {name}',
            # Check if hello.txt from mounting bucket exists after restart in
            # the mounted directory
            f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"'
        ]
        test = Test(
            'gcp_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)
@pytest.mark.azure
def test_azure_storage_mounts_with_stop():
    """Mount an Azure container, stop/start, and verify the mount survives."""
    name = get_cluster_name()
    cloud = 'azure'
    storage_name = f'sky-test-{int(time.time())}'
    default_region = 'eastus'
    storage_account_name = (storage_lib.AzureBlobStore.
                            get_default_storage_account_name(default_region))
    storage_account_key = data_utils.get_az_storage_account_key(
        storage_account_name)
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name, cloud=cloud)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            # FIX: the command-substitution assignment was implicitly
            # concatenated with the check with no '; ' separator (broken
            # bash), and '[ ... ] && exit 1' returned status 1 on the
            # success path. Terminate the assignment and neutralize the
            # status with '|| true'.
            f'output=$(az storage blob list -c {storage_name} --account-name {storage_account_name} --account-key {storage_account_key} --prefix hello.txt); '
            # if the file does not exist, az storage blob list returns '[]'
            f'[ "$output" = "[]" ] && exit 1 || true',
            f'sky stop -y {name}',
            f'sky start -y {name}',
            # Check if hello.txt from mounting bucket exists after restart in
            # the mounted directory
            f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"'
        ]
        test = Test(
            'azure_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.kubernetes
def test_kubernetes_storage_mounts():
    """Bucket mounting on k8s; the backing store may be S3 or GCS."""
    # Tests bucket mounting on k8s, assuming S3 is configured.
    # This test will fail if run on non x86_64 architecture, since goofys is
    # built for x86_64 only.
    name = get_cluster_name()
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud kubernetes {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            # Either provider may have been picked; accept either bucket.
            f'aws s3 ls {storage_name}/hello.txt || '
            f'gsutil ls gs://{storage_name}/hello.txt',
        ]
        test = Test(
            'kubernetes_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)
@pytest.mark.kubernetes
def test_kubernetes_context_switch():
    """Verify a launched cluster stays reachable after kubectl context switch."""
    name = get_cluster_name()
    new_context = f'sky-test-context-{int(time.time())}'
    new_namespace = f'sky-test-namespace-{int(time.time())}'

    test_commands = [
        # Launch a cluster and run a simple task
        f'sky launch -y -c {name} --cloud kubernetes "echo Hello from original context"',
        f'sky logs {name} 1 --status',  # Ensure job succeeded

        # Get current context details and save to a file for later use in
        # cleanup. NOTE: the next four strings plus the set-context command
        # are deliberately one concatenated shell command so the CURRENT_*
        # variables stay in scope for kubectl config set-context.
        'CURRENT_CONTEXT=$(kubectl config current-context); '
        'echo "$CURRENT_CONTEXT" > /tmp/sky_test_current_context; '
        'CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.cluster}"); '
        'CURRENT_USER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.user}"); '

        # Create a new context with a different name and namespace
        f'kubectl config set-context {new_context} --cluster="$CURRENT_CLUSTER" --user="$CURRENT_USER" --namespace={new_namespace}',

        # Create the new namespace if it doesn't exist
        f'kubectl create namespace {new_namespace} --dry-run=client -o yaml | kubectl apply -f -',

        # Set the new context as active
        f'kubectl config use-context {new_context}',

        # Verify the new context is active
        f'[ "$(kubectl config current-context)" = "{new_context}" ] || exit 1',

        # Try to run sky exec on the original cluster (should still work)
        f'sky exec {name} "echo Success: sky exec works after context switch"',

        # Test sky queue
        f'sky queue {name}',

        # Test SSH access
        f'ssh {name} whoami',
    ]

    cleanup_commands = (f'kubectl delete namespace {new_namespace}; '
                        f'kubectl config delete-context {new_context}; '
                        'kubectl config use-context $(cat /tmp/sky_test_current_context); '
                        'rm /tmp/sky_test_current_context; '
                        f'sky down -y {name}')

    test = Test(
        'kubernetes_context_switch',
        test_commands,
        cleanup_commands,
        timeout=20 * 60,  # 20 mins
    )
    run_one_test(test)


@pytest.mark.parametrize(
    'image_id',
    [
        'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04',
        'docker:ubuntu:18.04',
        # Test image with python 3.11 installed by default.
        'docker:continuumio/miniconda3:24.1.2-0',
        # Test python>=3.12 where SkyPilot should automatically create a
        # separate conda env for runtime with python 3.10.
        'docker:continuumio/miniconda3:latest',
    ])
def test_docker_storage_mounts(generic_cloud: str, image_id: str):
    """Bucket mounting inside a docker container image."""
    name = get_cluster_name()
    timestamp = str(time.time()).replace('.', '')
    storage_name = f'sky-test-{timestamp}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    template = jinja2.Template(template_str)
    # ubuntu 18.04 does not support fuse3, and blobfuse2 depends on fuse3.
    azure_mount_unsupported_ubuntu_version = '18.04'
    # Commands to verify bucket upload. We need to check all three
    # storage types because the optimizer may pick any of them.
    s3_command = f'aws s3 ls {storage_name}/hello.txt'
    gsutil_command = f'gsutil ls gs://{storage_name}/hello.txt'
    azure_blob_command = TestStorageWithCredentials.cli_ls_cmd(
        storage_lib.StoreType.AZURE, storage_name, suffix='hello.txt')
    if azure_mount_unsupported_ubuntu_version in image_id:
        # The store for mount_private_mount is not specified in the template.
        # If we're running on Azure, the private mount will be created on
        # azure blob. That will not be supported on the ubuntu 18.04 image
        # and thus fail. For other clouds, the private mount on other
        # storage types (GCS/S3) should succeed.
        # FIX: idiom — was `False if generic_cloud == 'azure' else True`.
        include_private_mount = generic_cloud != 'azure'
        content = template.render(storage_name=storage_name,
                                  include_azure_mount=False,
                                  include_private_mount=include_private_mount)
    else:
        content = template.render(storage_name=storage_name,)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            # Check AWS, GCP, or Azure storage mount.
            f'{s3_command} || '
            f'{gsutil_command} || '
            f'{azure_blob_command}',
        ]
        test = Test(
            'docker_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.cloudflare
def test_cloudflare_storage_mounts(generic_cloud: str):
    """R2 bucket mounting via the aws CLI with the r2 profile."""
    name = get_cluster_name()
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_r2_storage_mounting.yaml').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name)
    endpoint_url = cloudflare.create_endpoint()
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2'
        ]

        test = Test(
            'cloudflare_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)


@pytest.mark.ibm
def test_ibm_storage_mounts():
    """IBM COS bucket mounting, verified through the rclone profile."""
    name = get_cluster_name()
    storage_name = f'sky-test-{int(time.time())}'
    bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name(
        storage_name, Rclone.RcloneClouds.IBM)
    template_str = pathlib.Path(
        'tests/test_yamls/test_ibm_cos_storage_mounting.yaml').read_text()
    template = jinja2.Template(template_str)
    content = template.render(storage_name=storage_name)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        f.write(content)
        f.flush()
        file_path = f.name
        test_commands = [
            *STORAGE_SETUP_COMMANDS,
            f'sky launch -y -c {name} --cloud ibm {file_path}',
            f'sky logs {name} 1 --status',  # Ensure job succeeded.
            f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt',
        ]
        test = Test(
            'ibm_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)
# ---------- Testing Storage ----------
class TestStorageWithCredentials:
    """Storage tests which require credentials and network connection"""

    # Bucket names rejected by AWS S3 naming rules.
    AWS_INVALID_NAMES = [
        'ab',  # less than 3 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # more than 63 characters
        'Abcdef',  # contains an uppercase letter
        'abc def',  # contains a space
        'abc..def',  # two adjacent periods
        '192.168.5.4',  # formatted as an IP address
        'xn--bucket',  # starts with 'xn--' prefix
        'bucket-s3alias',  # ends with '-s3alias' suffix
        'bucket--ol-s3',  # ends with '--ol-s3' suffix
        '.abc',  # starts with a dot
        'abc.',  # ends with a dot
        '-abc',  # starts with a hyphen
        'abc-',  # ends with a hyphen
    ]

    # Bucket names rejected by GCS naming rules.
    GCS_INVALID_NAMES = [
        'ab',  # less than 3 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # more than 63 characters (without dots)
        'Abcdef',  # contains an uppercase letter
        'abc def',  # contains a space
        'abc..def',  # two adjacent periods
        # FIX: a missing comma after the next entry silently concatenated it
        # into the `* 5` literal below (implicit string concatenation binds
        # before `*`), collapsing two intended test cases into one bogus one.
        'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # More than 63 characters between dots
        'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqfghijklmnopqrstuvw' * 5,
        # more than 222 characters (with dots)
        '192.168.5.4',  # formatted as an IP address
        'googbucket',  # starts with 'goog' prefix
        'googlebucket',  # contains 'google'
        'g00glebucket',  # variant of 'google'
        'go0glebucket',  # variant of 'google'
        'g0oglebucket',  # variant of 'google'
        '.abc',  # starts with a dot
        'abc.',  # ends with a dot
        '_abc',  # starts with an underscore
        'abc_',  # ends with an underscore
    ]

    # Container names rejected by Azure Blob naming rules.
    AZURE_INVALID_NAMES = [
        'ab',  # less than 3 characters
        # more than 63 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        'Abcdef',  # contains an uppercase letter
        '.abc',  # starts with a non-letter(dot)
        'a--bc',  # contains consecutive hyphens
    ]

    # Bucket names rejected by IBM COS naming rules.
    IBM_INVALID_NAMES = [
        'ab',  # less than 3 characters
        'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1',
        # more than 63 characters
        'Abcdef',  # contains an uppercase letter
        'abc def',  # contains a space
        'abc..def',  # two adjacent periods
        '192.168.5.4',  # formatted as an IP address
        'xn--bucket',  # starts with 'xn--' prefix
        '.abc',  # starts with a dot
        'abc.',  # ends with a dot
        '-abc',  # starts with a hyphen
        'abc-',  # ends with a hyphen
        'a.-bc',  # contains the sequence '.-'
        'a-.bc',  # contains the sequence '-.'
        # FIX: missing commas fused the next two entries into 'a&bcab^c'.
        'a&bc',  # contains special characters
        'ab^c',  # contains special characters
    ]

    # Directory layout exercised by the .gitignore/.git-info-exclude sync
    # tests; None means "file", a dict means "subdirectory".
    GITIGNORE_SYNC_TEST_DIR_STRUCTURE = {
        'double_asterisk': {
            'double_asterisk_excluded': None,
            'double_asterisk_excluded_dir': {
                'dir_excluded': None,
            },
        },
        'double_asterisk_parent': {
            'parent': {
                'also_excluded.txt': None,
                'child': {
                    'double_asterisk_parent_child_excluded.txt': None,
                },
                'double_asterisk_parent_excluded.txt': None,
            },
        },
        'excluded.log': None,
        'excluded_dir': {
            'excluded.txt': None,
            'nested_excluded': {
                'excluded': None,
            },
        },
        'exp-1': {
            'be_excluded': None,
        },
        'exp-2': {
            'be_excluded': None,
        },
        'front_slash_excluded': None,
        'included.log': None,
        'included.txt': None,
        'include_dir': {
            'excluded.log': None,
            'included.log': None,
        },
        'nested_double_asterisk': {
            'one': {
                'also_exclude.txt': None,
            },
            'two': {
                'also_exclude.txt': None,
            },
        },
        'nested_wildcard_dir': {
            'monday': {
                'also_exclude.txt': None,
            },
            'tuesday': {
                'also_exclude.txt': None,
            },
        },
        'no_slash_excluded': None,
        'no_slash_tests': {
            'no_slash_excluded': {
                'also_excluded.txt': None,
            },
        },
        'question_mark': {
            'excluded1.txt': None,
            'excluded@.txt': None,
        },
        'square_bracket': {
            'excluded1.txt': None,
        },
        'square_bracket_alpha': {
            'excludedz.txt': None,
        },
        'square_bracket_excla': {
            'excluded2.txt': None,
            'excluded@.txt': None,
        },
        'square_bracket_single': {
            'excluded0.txt': None,
        },
    }
'excluded1.txt': None, + 'excluded@.txt': None, + }, + 'square_bracket': { + 'excluded1.txt': None, + }, + 'square_bracket_alpha': { + 'excludedz.txt': None, + }, + 'square_bracket_excla': { + 'excluded2.txt': None, + 'excluded@.txt': None, + }, + 'square_bracket_single': { + 'excluded0.txt': None, + }, + } + + @staticmethod + def create_dir_structure(base_path, structure): + # creates a given file STRUCTURE in BASE_PATH + for name, substructure in structure.items(): + path = os.path.join(base_path, name) + if substructure is None: + # Create a file + open(path, 'a', encoding='utf-8').close() + else: + # Create a subdirectory + os.mkdir(path) + TestStorageWithCredentials.create_dir_structure( + path, substructure) + + @staticmethod + def cli_delete_cmd(store_type, + bucket_name, + storage_account_name: str = None): + if store_type == storage_lib.StoreType.S3: + url = f's3://{bucket_name}' + return f'aws s3 rb {url} --force' + if store_type == storage_lib.StoreType.GCS: + url = f'gs://{bucket_name}' + gsutil_alias, alias_gen = data_utils.get_gsutil_command() + return f'{alias_gen}; {gsutil_alias} rm -r {url}' + if store_type == storage_lib.StoreType.AZURE: + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + return ('az storage container delete ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key} ' + f'--name {bucket_name}') + if store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + url = f's3://{bucket_name}' + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {url} --force --endpoint {endpoint_url} --profile=r2' + if store_type == storage_lib.StoreType.IBM: + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + bucket_name, Rclone.RcloneClouds.IBM) + return f'rclone purge 
{bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' + + @staticmethod + def cli_ls_cmd(store_type, bucket_name, suffix=''): + if store_type == storage_lib.StoreType.S3: + if suffix: + url = f's3://{bucket_name}/{suffix}' + else: + url = f's3://{bucket_name}' + return f'aws s3 ls {url}' + if store_type == storage_lib.StoreType.GCS: + if suffix: + url = f'gs://{bucket_name}/{suffix}' + else: + url = f'gs://{bucket_name}' + return f'gsutil ls {url}' + if store_type == storage_lib.StoreType.AZURE: + default_region = 'eastus' + config_storage_account = skypilot_config.get_nested( + ('azure', 'storage_account'), None) + storage_account_name = config_storage_account if ( + config_storage_account is not None) else ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + list_cmd = ('az storage blob list ' + f'--container-name {bucket_name} ' + f'--prefix {shlex.quote(suffix)} ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key}') + return list_cmd + if store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + if suffix: + url = f's3://{bucket_name}/{suffix}' + else: + url = f's3://{bucket_name}' + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' + if store_type == storage_lib.StoreType.IBM: + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + bucket_name, Rclone.RcloneClouds.IBM) + return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' + + @staticmethod + def cli_region_cmd(store_type, bucket_name=None, storage_account_name=None): + if store_type == storage_lib.StoreType.S3: + assert bucket_name is not None + return ('aws s3api get-bucket-location ' + f'--bucket {bucket_name} --output text') + elif store_type == storage_lib.StoreType.GCS: + assert 
bucket_name is not None + return (f'gsutil ls -L -b gs://{bucket_name}/ | ' + 'grep "Location constraint" | ' + 'awk \'{print tolower($NF)}\'') + elif store_type == storage_lib.StoreType.AZURE: + # For Azure Blob Storage, the location of the containers are + # determined by the location of storage accounts. + assert storage_account_name is not None + return (f'az storage account show --name {storage_account_name} ' + '--query "primaryLocation" --output tsv') + else: + raise NotImplementedError(f'Region command not implemented for ' + f'{store_type}') + + @staticmethod + def cli_count_name_in_bucket(store_type, + bucket_name, + file_name, + suffix='', + storage_account_name=None): + if store_type == storage_lib.StoreType.S3: + if suffix: + return f'aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' + else: + return f'aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' + elif store_type == storage_lib.StoreType.GCS: + if suffix: + return f'gsutil ls -r gs://{bucket_name}/{suffix} | grep "{file_name}" | wc -l' + else: + return f'gsutil ls -r gs://{bucket_name} | grep "{file_name}" | wc -l' + elif store_type == storage_lib.StoreType.AZURE: + if storage_account_name is None: + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + return ('az storage blob list ' + f'--container-name {bucket_name} ' + f'--prefix {shlex.quote(suffix)} ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key} | ' + f'grep {file_name} | ' + 'wc -l') + elif store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + if suffix: + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket 
"{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' + else: + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' + + @staticmethod + def cli_count_file_in_bucket(store_type, bucket_name): + if store_type == storage_lib.StoreType.S3: + return f'aws s3 ls s3://{bucket_name} --recursive | wc -l' + elif store_type == storage_lib.StoreType.GCS: + return f'gsutil ls -r gs://{bucket_name}/** | wc -l' + elif store_type == storage_lib.StoreType.AZURE: + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + return ('az storage blob list ' + f'--container-name {bucket_name} ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key} | ' + 'grep \\"name\\": | ' + 'wc -l') + elif store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{bucket_name} --recursive --endpoint {endpoint_url} --profile=r2 | wc -l' + + @pytest.fixture + def tmp_source(self, tmp_path): + # Creates a temporary directory with a file in it + tmp_dir = tmp_path / 'tmp-source' + tmp_dir.mkdir() + tmp_file = tmp_dir / 'tmp-file' + tmp_file.write_text('test') + circle_link = tmp_dir / 'circle-link' + circle_link.symlink_to(tmp_dir, target_is_directory=True) + yield str(tmp_dir) + + @staticmethod + def generate_bucket_name(): + # Creates a temporary bucket name + # time.time() returns varying precision on different systems, so we + # replace the decimal point and use whatever precision we can get. 
+ timestamp = str(time.time()).replace('.', '') + return f'sky-test-{timestamp}' + + @pytest.fixture + def tmp_bucket_name(self): + yield self.generate_bucket_name() + + @staticmethod + def yield_storage_object( + name: Optional[str] = None, + source: Optional[storage_lib.Path] = None, + stores: Optional[Dict[storage_lib.StoreType, + storage_lib.AbstractStore]] = None, + persistent: Optional[bool] = True, + mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT): + # Creates a temporary storage object. Stores must be added in the test. + storage_obj = storage_lib.Storage(name=name, + source=source, + stores=stores, + persistent=persistent, + mode=mode) + yield storage_obj + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + # If handle exists, delete manually + # TODO(romilb): This is potentially risky - if the delete method has + # bugs, this can cause resource leaks. Ideally we should manually + # eject storage from global_user_state and delete the bucket using + # boto3 directly. + storage_obj.delete() + + @pytest.fixture + def tmp_scratch_storage_obj(self, tmp_bucket_name): + # Creates a storage object with no source to create a scratch storage. + # Stores must be added in the test. + yield from self.yield_storage_object(name=tmp_bucket_name) + + @pytest.fixture + def tmp_multiple_scratch_storage_obj(self): + # Creates a list of 5 storage objects with no source to create + # multiple scratch storages. + # Stores for each object in the list must be added in the test. 
+ storage_mult_obj = [] + for _ in range(5): + timestamp = str(time.time()).replace('.', '') + store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}') + storage_mult_obj.append(store_obj) + yield storage_mult_obj + for storage_obj in storage_mult_obj: + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + # If handle exists, delete manually + # TODO(romilb): This is potentially risky - if the delete method has + # bugs, this can cause resource leaks. Ideally we should manually + # eject storage from global_user_state and delete the bucket using + # boto3 directly. + storage_obj.delete() + + @pytest.fixture + def tmp_multiple_custom_source_storage_obj(self): + # Creates a list of storage objects with custom source names to + # create multiple scratch storages. + # Stores for each object in the list must be added in the test. + custom_source_names = ['"path With Spaces"', 'path With Spaces'] + storage_mult_obj = [] + for name in custom_source_names: + src_path = os.path.expanduser(f'~/{name}') + pathlib.Path(src_path).expanduser().mkdir(exist_ok=True) + timestamp = str(time.time()).replace('.', '') + store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}', + source=src_path) + storage_mult_obj.append(store_obj) + yield storage_mult_obj + for storage_obj in storage_mult_obj: + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + storage_obj.delete() + + @pytest.fixture + def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): + # Creates a temporary storage object. Stores must be added in the test. + yield from self.yield_storage_object(name=tmp_bucket_name, + source=tmp_source) + + @pytest.fixture + def tmp_local_list_storage_obj(self, tmp_bucket_name, tmp_source): + # Creates a temp storage object which uses a list of paths as source. + # Stores must be added in the test. 
After upload, the bucket should + # have two files - /tmp-file and /tmp-source/tmp-file + list_source = [tmp_source, tmp_source + '/tmp-file'] + yield from self.yield_storage_object(name=tmp_bucket_name, + source=list_source) + + @pytest.fixture + def tmp_bulk_del_storage_obj(self, tmp_bucket_name): + # Creates a temporary storage object for testing bulk deletion. + # Stores must be added in the test. + with tempfile.TemporaryDirectory() as tmpdir: + subprocess.check_output(f'mkdir -p {tmpdir}/folder{{000..255}}', + shell=True) + subprocess.check_output(f'touch {tmpdir}/test{{000..255}}.txt', + shell=True) + subprocess.check_output( + f'touch {tmpdir}/folder{{000..255}}/test.txt', shell=True) + yield from self.yield_storage_object(name=tmp_bucket_name, + source=tmpdir) + + @pytest.fixture + def tmp_copy_mnt_existing_storage_obj(self, tmp_scratch_storage_obj): + # Creates a copy mount storage which reuses an existing storage object. + tmp_scratch_storage_obj.add_store(storage_lib.StoreType.S3) + storage_name = tmp_scratch_storage_obj.name + + # Try to initialize another storage with the storage object created + # above, but now in COPY mode. This should succeed. + yield from self.yield_storage_object(name=storage_name, + mode=storage_lib.StorageMode.COPY) + + @pytest.fixture + def tmp_gitignore_storage_obj(self, tmp_bucket_name, gitignore_structure): + # Creates a temporary storage object for testing .gitignore filter. + # GITIGINORE_STRUCTURE is representing a file structure in a dictionary + # format. Created storage object will contain the file structure along + # with .gitignore and .git/info/exclude files to test exclude filter. + # Stores must be added in the test. 
+ with tempfile.TemporaryDirectory() as tmpdir: + # Creates file structure to be uploaded in the Storage + self.create_dir_structure(tmpdir, gitignore_structure) + + # Create .gitignore and list files/dirs to be excluded in it + skypilot_path = os.path.dirname(os.path.dirname(sky.__file__)) + temp_path = f'{tmpdir}/.gitignore' + file_path = os.path.join(skypilot_path, 'tests/gitignore_test') + shutil.copyfile(file_path, temp_path) + + # Create .git/info/exclude and list files/dirs to be excluded in it + temp_path = f'{tmpdir}/.git/info/' + os.makedirs(temp_path) + temp_exclude_path = os.path.join(temp_path, 'exclude') + file_path = os.path.join(skypilot_path, + 'tests/git_info_exclude_test') + shutil.copyfile(file_path, temp_exclude_path) + + # Create sky Storage with the files created + yield from self.yield_storage_object( + name=tmp_bucket_name, + source=tmpdir, + mode=storage_lib.StorageMode.COPY) + + @pytest.fixture + def tmp_awscli_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using awscli + bucket_uri = f's3://{tmp_bucket_name}' + subprocess.check_call(['aws', 's3', 'mb', bucket_uri]) + yield tmp_bucket_name, bucket_uri + subprocess.check_call(['aws', 's3', 'rb', bucket_uri, '--force']) + + @pytest.fixture + def tmp_gsutil_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using gsutil + bucket_uri = f'gs://{tmp_bucket_name}' + subprocess.check_call(['gsutil', 'mb', bucket_uri]) + yield tmp_bucket_name, bucket_uri + subprocess.check_call(['gsutil', 'rm', '-r', bucket_uri]) + + @pytest.fixture + def tmp_az_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using the Azure CLI + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + bucket_uri = data_utils.AZURE_CONTAINER_URL.format( + storage_account_name=storage_account_name, + container_name=tmp_bucket_name) + 
subprocess.check_call([ + 'az', 'storage', 'container', 'create', '--name', + f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', + '--account-key', f'{storage_account_key}' + ]) + yield tmp_bucket_name, bucket_uri + subprocess.check_call([ + 'az', 'storage', 'container', 'delete', '--name', + f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', + '--account-key', f'{storage_account_key}' + ]) + + @pytest.fixture + def tmp_awscli_bucket_r2(self, tmp_bucket_name): + # Creates a temporary bucket using awscli + endpoint_url = cloudflare.create_endpoint() + bucket_uri = f's3://{tmp_bucket_name}' + subprocess.check_call( + f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 mb {bucket_uri} --endpoint {endpoint_url} --profile=r2', + shell=True) + yield tmp_bucket_name, bucket_uri + subprocess.check_call( + f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {bucket_uri} --force --endpoint {endpoint_url} --profile=r2', + shell=True) + + @pytest.fixture + def tmp_ibm_cos_bucket(self, tmp_bucket_name): + # Creates a temporary bucket using IBM COS API + storage_obj = storage_lib.IBMCosStore(source="", name=tmp_bucket_name) + yield tmp_bucket_name + storage_obj.delete() + + @pytest.fixture + def tmp_public_storage_obj(self, request): + # Initializes a storage object with a public bucket + storage_obj = storage_lib.Storage(source=request.param) + yield storage_obj + # This does not require any deletion logic because it is a public bucket + # and should not get added to global_user_state. 
+ + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, + store_type): + # Creates a new bucket with a local source, uploads files to it + # and deletes it. + tmp_local_storage_obj.add_store(store_type) + + # Run sky storage ls to check if storage object exists in the output + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_local_storage_obj.name in out.decode('utf-8') + + # Run sky storage delete to delete the storage object + subprocess.check_output( + ['sky', 'storage', 'delete', tmp_local_storage_obj.name, '--yes']) + + # Run sky storage ls to check if storage object is deleted + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_local_storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.xdist_group('multiple_bucket_deletion') + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm) + ]) + def test_multiple_buckets_creation_and_deletion( + self, tmp_multiple_scratch_storage_obj, store_type): + # Creates multiple new buckets(5 buckets) with a local source + # and deletes them. 
+ storage_obj_name = [] + for store_obj in tmp_multiple_scratch_storage_obj: + store_obj.add_store(store_type) + storage_obj_name.append(store_obj.name) + + # Run sky storage ls to check if all storage objects exists in the + # output filtered by store type + out_all = subprocess.check_output(['sky', 'storage', 'ls']) + out = [ + item.split()[0] + for item in out_all.decode('utf-8').splitlines() + if store_type.value in item + ] + assert all([item in out for item in storage_obj_name]) + + # Run sky storage delete all to delete all storage objects + delete_cmd = ['sky', 'storage', 'delete', '--yes'] + delete_cmd += storage_obj_name + subprocess.check_output(delete_cmd) + + # Run sky storage ls to check if all storage objects filtered by store + # type are deleted + out_all = subprocess.check_output(['sky', 'storage', 'ls']) + out = [ + item.split()[0] + for item in out_all.decode('utf-8').splitlines() + if store_type.value in item + ] + assert all([item not in out for item in storage_obj_name]) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_upload_source_with_spaces(self, store_type, + tmp_multiple_custom_source_storage_obj): + # Creates two buckets with specified local sources + # with spaces in the name + storage_obj_names = [] + for storage_obj in tmp_multiple_custom_source_storage_obj: + storage_obj.add_store(store_type) + storage_obj_names.append(storage_obj.name) + + # Run sky storage ls to check if all storage objects exists in the + # output filtered by store type + out_all = subprocess.check_output(['sky', 'storage', 'ls']) + out = [ + item.split()[0] + for item in out_all.decode('utf-8').splitlines() + if store_type.value in item + ] + assert all([item in 
out for item in storage_obj_names]) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_bucket_external_deletion(self, tmp_scratch_storage_obj, + store_type): + # Creates a bucket, deletes it externally using cloud cli commands + # and then tries to delete it using sky storage delete. + tmp_scratch_storage_obj.add_store(store_type) + + # Run sky storage ls to check if storage object exists in the output + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_scratch_storage_obj.name in out.decode('utf-8') + + # Delete bucket externally + cmd = self.cli_delete_cmd(store_type, tmp_scratch_storage_obj.name) + subprocess.check_output(cmd, shell=True) + + # Run sky storage delete to delete the storage object + out = subprocess.check_output( + ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name, '--yes']) + # Make sure bucket was not created during deletion (see issue #1322) + assert 'created' not in out.decode('utf-8').lower() + + # Run sky storage ls to check if storage object is deleted + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_scratch_storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_bucket_bulk_deletion(self, store_type, tmp_bulk_del_storage_obj): + # Creates a temp folder with over 256 files and folders, upload + # files and folders to a new bucket, then delete bucket. 
+ tmp_bulk_del_storage_obj.add_store(store_type) + + subprocess.check_output([ + 'sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name, '--yes' + ]) + + output = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_bulk_del_storage_obj.name not in output.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'tmp_public_storage_obj, store_type', + [('s3://tcga-2-open', storage_lib.StoreType.S3), + ('s3://digitalcorpora', storage_lib.StoreType.S3), + ('gs://gcp-public-data-sentinel-2', storage_lib.StoreType.GCS), + pytest.param( + 'https://azureopendatastorage.blob.core.windows.net/nyctlc', + storage_lib.StoreType.AZURE, + marks=pytest.mark.azure)], + indirect=['tmp_public_storage_obj']) + def test_public_bucket(self, tmp_public_storage_obj, store_type): + # Creates a new bucket with a public source and verifies that it is not + # added to global_user_state. + tmp_public_storage_obj.add_store(store_type) + + # Run sky storage ls to check if storage object exists in the output + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert tmp_public_storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'nonexist_bucket_url', + [ + 's3://{random_name}', + 'gs://{random_name}', + pytest.param( + 'https://{account_name}.blob.core.windows.net/{random_name}', # pylint: disable=line-too-long + marks=pytest.mark.azure), + pytest.param('cos://us-east/{random_name}', marks=pytest.mark.ibm), + pytest.param('r2://{random_name}', marks=pytest.mark.cloudflare) + ]) + def test_nonexistent_bucket(self, nonexist_bucket_url): + # Attempts to fetch a storage with a non-existent source. 
+ # Generate a random bucket name and verify it doesn't exist: + retry_count = 0 + while True: + nonexist_bucket_name = str(uuid.uuid4()) + if nonexist_bucket_url.startswith('s3'): + command = f'aws s3api head-bucket --bucket {nonexist_bucket_name}' + expected_output = '404' + elif nonexist_bucket_url.startswith('gs'): + command = f'gsutil ls {nonexist_bucket_url.format(random_name=nonexist_bucket_name)}' + expected_output = 'BucketNotFoundException' + elif nonexist_bucket_url.startswith('https'): + default_region = 'eastus' + storage_account_name = ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) + storage_account_key = data_utils.get_az_storage_account_key( + storage_account_name) + command = f'az storage container exists --account-name {storage_account_name} --account-key {storage_account_key} --name {nonexist_bucket_name}' + expected_output = '"exists": false' + elif nonexist_bucket_url.startswith('r2'): + endpoint_url = cloudflare.create_endpoint() + command = f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api head-bucket --bucket {nonexist_bucket_name} --endpoint {endpoint_url} --profile=r2' + expected_output = '404' + elif nonexist_bucket_url.startswith('cos'): + # Using API calls, since using rclone requires a profile's name + try: + expected_output = command = "echo" # avoid unrelated exception in case of failure. 
+ bucket_name = urllib.parse.urlsplit( + nonexist_bucket_url.format( + random_name=nonexist_bucket_name)).path.strip('/') + client = ibm.get_cos_client('us-east') + client.head_bucket(Bucket=bucket_name) + except ibm.ibm_botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == '404': + # success + return + else: + raise ValueError('Unsupported bucket type ' + f'{nonexist_bucket_url}') + + # Check if bucket exists using the cli: + try: + out = subprocess.check_output(command, + stderr=subprocess.STDOUT, + shell=True) + except subprocess.CalledProcessError as e: + out = e.output + out = out.decode('utf-8') + if expected_output in out: + break + else: + retry_count += 1 + if retry_count > 3: + raise RuntimeError('Unable to find a nonexistent bucket ' + 'to use. This is higly unlikely - ' + 'check if the tests are correct.') + + with pytest.raises(sky.exceptions.StorageBucketGetError, + match='Attempted to use a non-existent'): + if nonexist_bucket_url.startswith('https'): + storage_obj = storage_lib.Storage( + source=nonexist_bucket_url.format( + account_name=storage_account_name, + random_name=nonexist_bucket_name)) + else: + storage_obj = storage_lib.Storage( + source=nonexist_bucket_url.format( + random_name=nonexist_bucket_name)) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'private_bucket', + [ + f's3://imagenet', + f'gs://imagenet', + pytest.param('https://smoketestprivate.blob.core.windows.net/test', + marks=pytest.mark.azure), # pylint: disable=line-too-long + pytest.param('cos://us-east/bucket1', marks=pytest.mark.ibm) + ]) + def test_private_bucket(self, private_bucket): + # Attempts to access private buckets not belonging to the user. + # These buckets are known to be private, but may need to be updated if + # they are removed by their owners. 
+ store_type = urllib.parse.urlsplit(private_bucket).scheme + if store_type == 'https' or store_type == 'cos': + private_bucket_name = urllib.parse.urlsplit( + private_bucket).path.strip('/') + else: + private_bucket_name = urllib.parse.urlsplit(private_bucket).netloc + with pytest.raises( + sky.exceptions.StorageBucketGetError, + match=storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( + name=private_bucket_name)): + storage_obj = storage_lib.Storage(source=private_bucket) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('ext_bucket_fixture, store_type', + [('tmp_awscli_bucket', storage_lib.StoreType.S3), + ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), + pytest.param('tmp_az_bucket', + storage_lib.StoreType.AZURE, + marks=pytest.mark.azure), + pytest.param('tmp_ibm_cos_bucket', + storage_lib.StoreType.IBM, + marks=pytest.mark.ibm), + pytest.param('tmp_awscli_bucket_r2', + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_upload_to_existing_bucket(self, ext_bucket_fixture, request, + tmp_source, store_type): + # Tries uploading existing files to newly created bucket (outside of + # sky) and verifies that files are written. 
+ bucket_name, _ = request.getfixturevalue(ext_bucket_fixture) + storage_obj = storage_lib.Storage(name=bucket_name, source=tmp_source) + storage_obj.add_store(store_type) + + # Check if tmp_source/tmp-file exists in the bucket using aws cli + out = subprocess.check_output(self.cli_ls_cmd(store_type, bucket_name), + shell=True) + assert 'tmp-file' in out.decode('utf-8'), \ + 'File not found in bucket - output was : {}'.format(out.decode + ('utf-8')) + + # Check symlinks - symlinks don't get copied by sky storage + assert (pathlib.Path(tmp_source) / 'circle-link').is_symlink(), ( + 'circle-link was not found in the upload source - ' + 'are the test fixtures correct?') + assert 'circle-link' not in out.decode('utf-8'), ( + 'Symlink found in bucket - ls output was : {}'.format( + out.decode('utf-8'))) + + # Run sky storage ls to check if storage object exists in the output. + # It should not exist because the bucket was created externally. + out = subprocess.check_output(['sky', 'storage', 'ls']) + assert storage_obj.name not in out.decode('utf-8') + + @pytest.mark.no_fluidstack + def test_copy_mount_existing_storage(self, + tmp_copy_mnt_existing_storage_obj): + # Creates a bucket with no source in MOUNT mode (empty bucket), and + # then tries to load the same storage in COPY mode. + tmp_copy_mnt_existing_storage_obj.add_store(storage_lib.StoreType.S3) + storage_name = tmp_copy_mnt_existing_storage_obj.name + + # Check `sky storage ls` to ensure storage object exists + out = subprocess.check_output(['sky', 'storage', 'ls']).decode('utf-8') + assert storage_name in out, f'Storage {storage_name} not found in sky storage ls.' 
+ + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('store_type', [ + storage_lib.StoreType.S3, storage_lib.StoreType.GCS, + pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), + pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), + pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) + ]) + def test_list_source(self, tmp_local_list_storage_obj, store_type): + # Uses a list in the source field to specify a file and a directory to + # be uploaded to the storage object. + tmp_local_list_storage_obj.add_store(store_type) + + # Check if tmp-file exists in the bucket root using cli + out = subprocess.check_output(self.cli_ls_cmd( + store_type, tmp_local_list_storage_obj.name), + shell=True) + assert 'tmp-file' in out.decode('utf-8'), \ + 'File not found in bucket - output was : {}'.format(out.decode + ('utf-8')) + + # Check if tmp-file exists in the bucket/tmp-source using cli + out = subprocess.check_output(self.cli_ls_cmd( + store_type, tmp_local_list_storage_obj.name, 'tmp-source/'), + shell=True) + assert 'tmp-file' in out.decode('utf-8'), \ + 'File not found in bucket - output was : {}'.format(out.decode + ('utf-8')) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('invalid_name_list, store_type', + [(AWS_INVALID_NAMES, storage_lib.StoreType.S3), + (GCS_INVALID_NAMES, storage_lib.StoreType.GCS), + pytest.param(AZURE_INVALID_NAMES, + storage_lib.StoreType.AZURE, + marks=pytest.mark.azure), + pytest.param(IBM_INVALID_NAMES, + storage_lib.StoreType.IBM, + marks=pytest.mark.ibm), + pytest.param(AWS_INVALID_NAMES, + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_invalid_names(self, invalid_name_list, store_type): + # Attempts to create storage objects with invalid names and verifies + # that each raises a StorageNameError. 
+ for name in invalid_name_list: + with pytest.raises(sky.exceptions.StorageNameError): + storage_obj = storage_lib.Storage(name=name) + storage_obj.add_store(store_type) + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize( + 'gitignore_structure, store_type', + [(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.S3), + (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.GCS), + (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.AZURE), + pytest.param(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, + store_type, + tmp_gitignore_storage_obj): + # tests if files included in .gitignore and .git/info/exclude are + # excluded from being transferred to Storage + + tmp_gitignore_storage_obj.add_store(store_type) + + upload_file_name = 'included' + # Count the number of files with the given file name + up_cmd = self.cli_count_name_in_bucket(store_type, \ + tmp_gitignore_storage_obj.name, file_name=upload_file_name) + git_exclude_cmd = self.cli_count_name_in_bucket(store_type, \ + tmp_gitignore_storage_obj.name, file_name='.git') + cnt_num_file_cmd = self.cli_count_file_in_bucket( + store_type, tmp_gitignore_storage_obj.name) + + up_output = subprocess.check_output(up_cmd, shell=True) + git_exclude_output = subprocess.check_output(git_exclude_cmd, + shell=True) + cnt_output = subprocess.check_output(cnt_num_file_cmd, shell=True) + + assert '3' in up_output.decode('utf-8'), \ + 'Files to be included are not completely uploaded.' + # 1 is read as .gitignore is uploaded + assert '1' in git_exclude_output.decode('utf-8'), \ + '.git directory should not be uploaded.' + # 4 files include .gitignore, included.log, included.txt, include_dir/included.log + assert '4' in cnt_output.decode('utf-8'), \ + 'Some items listed in .gitignore and .git/info/exclude are not excluded.' 
+ + @pytest.mark.parametrize('ext_bucket_fixture, store_type', + [('tmp_awscli_bucket', storage_lib.StoreType.S3), + ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), + pytest.param('tmp_awscli_bucket_r2', + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_externally_created_bucket_mount_without_source( + self, ext_bucket_fixture, request, store_type): + # Non-sky managed buckets(buckets created outside of Skypilot CLI) + # are allowed to be MOUNTed by specifying the URI of the bucket to + # source field only. When it is attempted by specifying the name of + # the bucket only, it should error out. + # + # TODO(doyoung): Add test for IBM COS. Currently, this is blocked + # as rclone used to interact with IBM COS does not support feature to + # create a bucket, and the ibmcloud CLI is not supported in Skypilot. + # Either of the feature is necessary to simulate an external bucket + # creation for IBM COS. + # https://github.com/skypilot-org/skypilot/pull/1966/files#r1253439837 + + ext_bucket_name, ext_bucket_uri = request.getfixturevalue( + ext_bucket_fixture) + # invalid spec + with pytest.raises(sky.exceptions.StorageSpecError) as e: + storage_obj = storage_lib.Storage( + name=ext_bucket_name, mode=storage_lib.StorageMode.MOUNT) + storage_obj.add_store(store_type) + + assert 'Attempted to mount a non-sky managed bucket' in str(e) + + # valid spec + storage_obj = storage_lib.Storage(source=ext_bucket_uri, + mode=storage_lib.StorageMode.MOUNT) + handle = global_user_state.get_handle_from_storage_name( + storage_obj.name) + if handle: + storage_obj.delete() + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('region', [ + 'ap-northeast-1', 'ap-northeast-2', 'ap-northeast-3', 'ap-south-1', + 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'eu-north-1', + 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 'us-east-1', + 'us-east-2', 'us-west-1', 'us-west-2' + ]) + def test_aws_regions(self, tmp_local_storage_obj, region): + # This tests 
creation and upload to bucket in all AWS s3 regions + # To test full functionality, use test_managed_jobs_storage above. + store_type = storage_lib.StoreType.S3 + tmp_local_storage_obj.add_store(store_type, region=region) + bucket_name = tmp_local_storage_obj.name + + # Confirm that the bucket was created in the correct region + region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) + out = subprocess.check_output(region_cmd, shell=True) + output = out.decode('utf-8') + expected_output_region = region + if region == 'us-east-1': + expected_output_region = 'None' # us-east-1 is the default region + assert expected_output_region in out.decode('utf-8'), ( + f'Bucket was not found in region {region} - ' + f'output of {region_cmd} was: {output}') + + # Check if tmp_source/tmp-file exists in the bucket using cli + ls_cmd = self.cli_ls_cmd(store_type, bucket_name) + out = subprocess.check_output(ls_cmd, shell=True) + output = out.decode('utf-8') + assert 'tmp-file' in output, ( + f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') + + @pytest.mark.no_fluidstack + @pytest.mark.parametrize('region', [ + 'northamerica-northeast1', 'northamerica-northeast2', 'us-central1', + 'us-east1', 'us-east4', 'us-east5', 'us-south1', 'us-west1', 'us-west2', + 'us-west3', 'us-west4', 'southamerica-east1', 'southamerica-west1', + 'europe-central2', 'europe-north1', 'europe-southwest1', 'europe-west1', + 'europe-west2', 'europe-west3', 'europe-west4', 'europe-west6', + 'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12', + 'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2', + 'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2', + 'asia-southeast2', 'me-central1', 'me-central2', 'me-west1', + 'australia-southeast1', 'australia-southeast2', 'africa-south1' + ]) + def test_gcs_regions(self, tmp_local_storage_obj, region): + # This tests creation and upload to bucket in all GCS regions + # To test full functionality, use 
 test_managed_jobs_storage above. + store_type = storage_lib.StoreType.GCS + tmp_local_storage_obj.add_store(store_type, region=region) + bucket_name = tmp_local_storage_obj.name + + # Confirm that the bucket was created in the correct region + region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) + out = subprocess.check_output(region_cmd, shell=True) + output = out.decode('utf-8') + assert region in out.decode('utf-8'), ( + f'Bucket was not found in region {region} - ' + f'output of {region_cmd} was: {output}') + + # Check if tmp_source/tmp-file exists in the bucket using cli + ls_cmd = self.cli_ls_cmd(store_type, bucket_name) + out = subprocess.check_output(ls_cmd, shell=True) + output = out.decode('utf-8') + assert 'tmp-file' in output, ( + f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index 0fc7ce409fc..3000c82068d 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -1,56 +1,34 @@ -# Smoke tests for SkyPilot +# Smoke tests for SkyPilot for region and zone # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py +# > pytest tests/smoke_tests/test_region_and_zone.py # # Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure +# > pytest tests/smoke_tests/test_region_and_zone.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve +# > pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region # # Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws +# > pytest 
tests/smoke_tests/test_region_and_zone.py --aws # # Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws - -import enum -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys +# > pytest tests/smoke_tests/test_region_and_zone.py --generic-cloud aws + import tempfile import textwrap -import time -from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse -import uuid -import colorama -import jinja2 import pytest -from smoke_tests.util import _get_cluster_name -from smoke_tests.util import ( - _get_cmd_wait_until_cluster_status_contains_wildcard) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) +from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains_wildcard from smoke_tests.util import run_one_test from smoke_tests.util import Test +from smoke_tests.util import ( + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) from sky.jobs.state import ManagedJobStatus from sky.skylet import constants @@ -60,7 +38,7 @@ # ---------- Test region ---------- @pytest.mark.aws def test_aws_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_region', [ @@ -81,7 +59,7 @@ def test_aws_region(): @pytest.mark.aws def test_aws_with_ssh_proxy_command(): - name = _get_cluster_name() + name = get_cluster_name() with tempfile.NamedTemporaryFile(mode='w') as f: f.write( @@ -104,13 +82,12 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. 
- _get_cmd_wait_until_cluster_status_contains_wildcard( + get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', cluster_status=ClusterStatus.UP.value, timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. - format( + WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( job_name=name, job_status= f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', @@ -123,7 +100,7 @@ def test_aws_with_ssh_proxy_command(): @pytest.mark.gcp def test_gcp_region_and_service_account(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_region', [ @@ -146,7 +123,7 @@ def test_gcp_region_and_service_account(): @pytest.mark.ibm def test_ibm_region(): - name = _get_cluster_name() + name = get_cluster_name() region = 'eu-de' test = Test( 'region', @@ -163,7 +140,7 @@ def test_ibm_region(): @pytest.mark.azure def test_azure_region(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'azure_region', [ @@ -187,7 +164,7 @@ def test_azure_region(): # ---------- Test zone ---------- @pytest.mark.aws def test_aws_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'aws_zone', [ @@ -203,7 +180,7 @@ def test_aws_zone(): @pytest.mark.ibm def test_ibm_zone(): - name = _get_cluster_name() + name = get_cluster_name() zone = 'eu-de-2' test = Test( 'zone', @@ -220,7 +197,7 @@ def test_ibm_zone(): @pytest.mark.gcp def test_gcp_zone(): - name = _get_cluster_name() + name = get_cluster_name() test = Test( 'gcp_zone', [ diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_required_before_merge.py new file mode 100644 index 00000000000..dd368718821 --- /dev/null +++ b/tests/smoke_tests/test_required_before_merge.py @@ -0,0 +1,46 @@ +# Smoke tests for SkyPilot 
required before merging +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_required_before_merge.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_required_before_merge.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_required_before_merge.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws + +from smoke_tests.util import get_cluster_name +from smoke_tests.util import run_one_test +from smoke_tests.util import Test +from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID + +from sky.skylet import events +from sky.skylet.job_lib import JobStatus + + +def test_yaml_launch_and_mount(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'test_yaml_launch_and_mount', + [ + f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', + WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=name, + job_id=1, + job_status=JobStatus.SUCCEEDED.value, + timeout=2 * 60), + ], + f'sky down -y {name}', + timeout=5 * 60, + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py new file mode 100644 index 00000000000..f56d9bb96ee --- /dev/null +++ b/tests/smoke_tests/test_sky_serve.py @@ -0,0 +1,795 @@ +# Smoke tests for SkyPilot for sky serve +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_sky_serve.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_sky_serve.py --terminate-on-failure +# 
+
+# Re-run last failed tests
+# > pytest --lf
+#
+# Run one of the smoke tests
+# > pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http
+#
+# Only run sky serve tests
+# > pytest tests/smoke_tests/test_sky_serve.py --sky-serve
+#
+# Only run test for AWS + generic tests
+# > pytest tests/smoke_tests/test_sky_serve.py --aws
+#
+# Change cloud for generic tests to aws
+# > pytest tests/smoke_tests/test_sky_serve.py --generic-cloud aws
+
+import inspect
+import json
+import shlex
+from typing import List, Tuple
+
+import pytest
+from smoke_tests.util import get_cluster_name
+from smoke_tests.util import run_one_test
+from smoke_tests.util import terminate_gcp_replica
+from smoke_tests.util import Test
+from smoke_tests.util import test_id
+
+from sky import serve
+from sky.utils import common_utils
+
+# ---------- Testing skyserve ----------
+
+
+def _get_service_name() -> str:
+    """Returns a user-unique service name for each test_skyserve_().
+
+    Must be called from each test_skyserve_().
+    """
+    caller_func_name = inspect.stack()[1][3]
+    test_name = caller_func_name.replace('_', '-').replace('test-', 't-')
+    test_name = test_name.replace('skyserve-', 'ss-')
+    test_name = common_utils.make_cluster_name_on_cloud(test_name, 24)
+    return f'{test_name}-{test_id}'
+
+
+# We check the output of the skyserve service to see if it is ready. Output of
+# `REPLICAS` is in the form of `1/2` where the first number is the number of
+# ready replicas and the second number is the number of total replicas. We
+# grep such format to ensure that the service is ready, and early exit if any
+# failure detected. In the end we sleep for
+# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure the load balancer has
+# enough time to sync with the controller and get all ready replica IPs. 
+_SERVE_WAIT_UNTIL_READY = ( + '{{ while true; do' + ' s=$(sky serve status {name}); echo "$s";' + ' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;' + ' echo "$s" | grep -q "FAILED" && exit 1;' + ' sleep 10;' + ' done; }}; echo "Got service status $s";' + f'sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS + 2};') +_IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' +_AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' +_SERVICE_LAUNCHING_STATUS_REGEX = 'PROVISIONING\|STARTING' +# Since we don't allow terminate the service if the controller is INIT, +# which is common for simultaneous pytest, we need to wait until the +# controller is UP before we can terminate the service. +# The teardown command has a 10-mins timeout, so we don't need to do +# the timeout here. See implementation of run_one_test() for details. +_TEARDOWN_SERVICE = ( + '(for i in `seq 1 20`; do' + ' s=$(sky serve down -y {name});' + ' echo "Trying to terminate {name}";' + ' echo "$s";' + ' echo "$s" | grep -q "scheduled to be terminated\|No service to terminate" && break;' + ' sleep 10;' + ' [ $i -eq 20 ] && echo "Failed to terminate service {name}";' + 'done)') + +_SERVE_ENDPOINT_WAIT = ( + 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' + 'endpoint=$(sky serve status --endpoint {name}); ' + 'until ! echo "$endpoint" | grep "Controller is initializing"; ' + 'do echo "Waiting for serve endpoint to be ready..."; ' + 'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; ' + 'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"') + +_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); ' + 'until ! 
echo "$s" | grep "Controller is initializing."; ' + 'do echo "Waiting for serve status to be ready..."; ' + 'sleep 5; s=$(sky serve status {name}); done; echo "$s"') + + +def _get_replica_ip(name: str, replica_id: int) -> str: + return (f'ip{replica_id}=$(echo "$s" | ' + f'awk "{_AWK_ALL_LINES_BELOW_REPLICAS}" | ' + f'grep -E "{name}\s+{replica_id}" | ' + f'grep -Eo "{_IP_REGEX}")') + + +def _get_skyserve_http_test(name: str, cloud: str, + timeout_minutes: int) -> Test: + test = Test( + f'test-skyserve-{cloud.replace("_", "-")}', + [ + f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=timeout_minutes * 60, + ) + return test + + +def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool, + str]]) -> str: + """Check replicas' status and count in sky serve status + + We will check vCPU=2, as all our tests use vCPU=2. + + Args: + name: the name of the service + check_tuples: A list of replica property to check. Each tuple is + (count, is_spot, status) + """ + check_cmd = '' + for check_tuple in check_tuples: + count, is_spot, status = check_tuple + resource_str = '' + if status not in ['PENDING', 'SHUTTING_DOWN' + ] and not status.startswith('FAILED'): + spot_str = '' + if is_spot: + spot_str = '\[Spot\]' + resource_str = f'({spot_str}vCPU=2)' + check_cmd += (f' echo "$s" | grep "{resource_str}" | ' + f'grep "{status}" | wc -l | grep {count} || exit 1;') + return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd) + + +def _check_service_version(service_name: str, version: str) -> str: + # Grep the lines before 'Service Replicas' and check if the service version + # is correct. 
+ return (f'echo "$s" | grep -B1000 "Service Replicas" | ' + f'grep -E "{service_name}\s+{version}" || exit 1; ') + + +@pytest.mark.gcp +@pytest.mark.serve +def test_skyserve_gcp_http(): + """Test skyserve on GCP""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'gcp', 20) + run_one_test(test) + + +@pytest.mark.aws +@pytest.mark.serve +def test_skyserve_aws_http(): + """Test skyserve on AWS""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'aws', 20) + run_one_test(test) + + +@pytest.mark.azure +@pytest.mark.serve +def test_skyserve_azure_http(): + """Test skyserve on Azure""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'azure', 30) + run_one_test(test) + + +@pytest.mark.kubernetes +@pytest.mark.serve +def test_skyserve_kubernetes_http(): + """Test skyserve on Kubernetes""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'kubernetes', 30) + run_one_test(test) + + +@pytest.mark.oci +@pytest.mark.serve +def test_skyserve_oci_http(): + """Test skyserve on OCI""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'oci', 20) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now +@pytest.mark.serve +def test_skyserve_llm(generic_cloud: str): + """Test skyserve with real LLM usecase""" + name = _get_service_name() + + def generate_llm_test_command(prompt: str, expected_output: str) -> str: + prompt = shlex.quote(prompt) + expected_output = shlex.quote(expected_output) + return ( + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'python tests/skyserve/llm/get_response.py --endpoint $endpoint ' + f'--prompt {prompt} | grep {expected_output}') + + with open('tests/skyserve/llm/prompt_output.json', 'r', + encoding='utf-8') as f: + prompt2output = json.load(f) + + test = Test( + f'test-skyserve-llm', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', + 
_SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + *[ + generate_llm_test_command(prompt, output) + for prompt, output in prompt2output.items() + ], + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=40 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.serve +def test_skyserve_spot_recovery(): + name = _get_service_name() + zone = 'us-central1-a' + + test = Test( + f'test-skyserve-spot-recovery-gcp', + [ + f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + terminate_gcp_replica(name, zone, 1), + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Fluidstack does not support spot instances +@pytest.mark.serve +@pytest.mark.no_kubernetes +def test_skyserve_base_ondemand_fallback(generic_cloud: str): + name = _get_service_name() + test = Test( + f'test-skyserve-base-ondemand-fallback', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + _check_replica_in_status(name, [(1, True, 'READY'), + (1, False, 'READY')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.serve +def test_skyserve_dynamic_ondemand_fallback(): + name = _get_service_name() + zone = 'us-central1-a' + + test = Test( + f'test-skyserve-dynamic-ondemand-fallback', + [ + f'sky serve up -n {name} --cloud gcp -y 
tests/skyserve/spot/dynamic_ondemand_fallback.yaml', + f'sleep 40', + # 2 on-demand (provisioning) + 2 Spot (provisioning). + f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s";' + 'echo "$s" | grep -q "0/4" || exit 1', + # Wait for the provisioning starts + f'sleep 40', + _check_replica_in_status(name, [ + (2, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), + (2, False, _SERVICE_LAUNCHING_STATUS_REGEX + '\|SHUTTING_DOWN') + ]), + + # Wait until 2 spot instances are ready. + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + _check_replica_in_status(name, [(2, True, 'READY'), + (0, False, '')]), + terminate_gcp_replica(name, zone, 1), + f'sleep 40', + # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). + f'{_SERVE_STATUS_WAIT.format(name=name)}; ' + 'echo "$s" | grep -q "1/3"', + _check_replica_in_status( + name, [(1, True, 'READY'), + (1, True, _SERVICE_LAUNCHING_STATUS_REGEX), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), + + # Wait until 2 spot instances are ready. + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + _check_replica_in_status(name, [(2, True, 'READY'), + (0, False, '')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_user_bug_restart(generic_cloud: str): + """Tests that we restart the service after user bug.""" + # TODO(zhwu): this behavior needs some rethinking. 
+ name = _get_service_name() + test = Test( + f'test-skyserve-user-bug-restart', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', + f's=$(sky serve status {name}); echo "$s";' + 'until echo "$s" | grep -A 100 "Service Replicas" | grep "SHUTTING_DOWN"; ' + 'do echo "Waiting for first service to be SHUTTING DOWN..."; ' + f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; ', + f's=$(sky serve status {name}); echo "$s";' + 'until echo "$s" | grep -A 100 "Service Replicas" | grep "FAILED"; ' + 'do echo "Waiting for first service to be FAILED..."; ' + f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; echo "$s"; ' + + _check_replica_in_status(name, [(1, True, 'FAILED')]) + + # User bug failure will cause no further scaling. + f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 1; ' + f'echo "$s" | grep -B 100 "NO_REPLICA" | grep "0/0"', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/auto_restart.yaml', + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'until curl http://$endpoint | grep "Hi, SkyPilot here!"; do sleep 2; done; sleep 2; ' + + _check_replica_in_status(name, [(1, False, 'READY'), + (1, False, 'FAILED')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +@pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP +def test_skyserve_load_balancer(generic_cloud: str): + """Test skyserve load balancer round-robin policy""" + name = _get_service_name() + test = Test( + f'test-skyserve-load-balancer', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{_SERVE_STATUS_WAIT.format(name=name)}; ' + f'{_get_replica_ip(name, 1)}; ' + f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' + 
'python tests/skyserve/load_balancer/test_round_robin.py ' + '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.serve +@pytest.mark.no_kubernetes +def test_skyserve_auto_restart(): + """Test skyserve with auto restart""" + name = _get_service_name() + zone = 'us-central1-a' + test = Test( + f'test-skyserve-auto-restart', + [ + # TODO(tian): we can dynamically generate YAML from template to + # avoid maintaining too many YAML files + f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + # sleep for 20 seconds (initial delay) to make sure it will + # be restarted + f'sleep 20', + terminate_gcp_replica(name, zone, 1), + # Wait for consecutive failure timeout passed. + # If the cluster is not using spot, it won't check the cluster status + # on the cloud (since manual shutdown is not a common behavior and such + # queries takes a lot of time). Instead, we think continuous 3 min probe + # failure is not a temporary problem but indeed a failure. + 'sleep 180', + # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time + # that the output of `sky serve status` shows FAILED and this status will + # cause _SERVE_WAIT_UNTIL_READY to early quit. 
+ '(while true; do' + f' output=$(sky serve status {name});' + ' echo "$output" | grep -q "1/1" && break;' + ' sleep 10;' + f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_cancel(generic_cloud: str): + """Test skyserve with cancel""" + name = _get_service_name() + + test = Test( + f'test-skyserve-cancel', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' + 'tests/skyserve/cancel/send_cancel_request.py ' + '--endpoint $endpoint | grep "Request was cancelled"', + f's=$(sky serve logs {name} 1 --no-follow); ' + 'until ! echo "$s" | grep "Please wait for the controller to be"; ' + 'do echo "Waiting for serve logs"; sleep 10; ' + f's=$(sky serve logs {name} 1 --no-follow); done; ' + 'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_streaming(generic_cloud: str): + """Test skyserve with streaming""" + name = _get_service_name() + test = Test( + f'test-skyserve-streaming', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'python3 tests/skyserve/streaming/send_streaming_request.py ' + '--endpoint $endpoint | grep "Streaming test passed"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def 
test_skyserve_readiness_timeout_fail(generic_cloud: str): + """Test skyserve with large readiness probe latency, expected to fail""" + name = _get_service_name() + test = Test( + f'test-skyserve-readiness-timeout-fail', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', + # None of the readiness probe will pass, so the service will be + # terminated after the initial delay. + f's=$(sky serve status {name}); ' + f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done;', + 'sleep 60', + f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_large_readiness_timeout(generic_cloud: str): + """Test skyserve with customized large readiness timeout""" + name = _get_service_name() + test = Test( + f'test-skyserve-large-readiness-timeout', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_update(generic_cloud: str): + """Test skyserve with update""" + name = _get_service_name() + test = Test( + f'test-skyserve-update', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + 
f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/new.yaml', + # sleep before update is registered. + 'sleep 20', + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;' + # Make sure the traffic is not mixed + 'curl http://$endpoint | grep "Hi, new SkyPilot here"', + # The latest 2 version should be READY and the older versions should be shutting down + (_check_replica_in_status(name, [(2, False, 'READY'), + (2, False, 'SHUTTING_DOWN')]) + + _check_service_version(name, "2")), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_rolling_update(generic_cloud: str): + """Test skyserve with rolling update""" + name = _get_service_name() + single_new_replica = _check_replica_in_status( + name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (1, False, 'SHUTTING_DOWN')]) + test = Test( + f'test-skyserve-rolling-update', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/new.yaml', + # Make sure the traffic is mixed across two versions, the replicas + # with even id will sleep 60 seconds before being ready, so we + # should be able to get observe the period that the traffic is mixed + # across two versions. 
+ f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; ' + # The latest version should have one READY and the one of the older versions should be shutting down + f'{single_new_replica} {_check_service_version(name, "1,2")} ' + # Check the output from the old version, immediately after the + # output from the new version appears. This is guaranteed by the + # round robin load balancing policy. + # TODO(zhwu): we should have a more generalized way for checking the + # mixed version of replicas to avoid depending on the specific + # round robin load balancing policy. + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_fast_update(generic_cloud: str): + """Test skyserve with fast update (Increment version of old replicas)""" + name = _get_service_name() + + test = Test( + f'test-skyserve-fast-update', + [ + f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', + # sleep to wait for update to be registered. + 'sleep 40', + # 2 on-deamnd (ready) + 1 on-demand (provisioning). + ( + _check_replica_in_status( + name, [(2, False, 'READY'), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) + + # Fast update will directly have the latest version ready. 
+ _check_service_version(name, "2")), + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + + _check_service_version(name, "2"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + # Test rolling update + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', + # sleep to wait for update to be registered. + 'sleep 25', + # 2 on-deamnd (ready) + 1 on-demand (shutting down). + _check_replica_in_status(name, [(2, False, 'READY'), + (1, False, 'SHUTTING_DOWN')]), + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "3"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.serve +def test_skyserve_update_autoscale(generic_cloud: str): + """Test skyserve update with autoscale""" + name = _get_service_name() + test = Test( + f'test-skyserve-update-autoscale', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "1"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/num_min_one.yaml', + # sleep before update is registered. + 'sleep 20', + # Timeout will be triggered when update fails. + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + + _check_service_version(name, "2"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here!"', + # Rolling Update + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', + # sleep before update is registered. + 'sleep 20', + # Timeout will be triggered when update fails. 
+ _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "3"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here!"', + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=30 * 60, + ) + run_one_test(test) + + +@pytest.mark.no_fluidstack # Spot instances are note supported by Fluidstack +@pytest.mark.serve +@pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes +@pytest.mark.parametrize('mode', ['rolling', 'blue_green']) +def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): + """Test skyserve with update that changes autoscaler""" + name = f'{_get_service_name()}-{mode}' + + wait_until_no_pending = ( + f's=$(sky serve status {name}); echo "$s"; ' + 'until ! echo "$s" | grep PENDING; do ' + ' echo "Waiting for replica to be out of pending..."; ' + f' sleep 5; s=$(sky serve status {name}); ' + ' echo "$s"; ' + 'done') + four_spot_up_cmd = _check_replica_in_status(name, [(4, True, 'READY')]) + update_check = [f'until ({four_spot_up_cmd}); do sleep 5; done; sleep 15;'] + if mode == 'rolling': + # Check rolling update, it will terminate one of the old on-demand + # instances, once there are 4 spot instance ready. + update_check += [ + _check_replica_in_status( + name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (1, False, 'SHUTTING_DOWN'), (1, False, 'READY')]) + + _check_service_version(name, "1,2"), + ] + else: + # Check blue green update, it will keep both old on-demand instances + # running, once there are 4 spot instance ready. 
+ update_check += [ + _check_replica_in_status( + name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (2, False, 'READY')]) + + _check_service_version(name, "1"), + ] + test = Test( + f'test-skyserve-new-autoscaler-update-{mode}', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + _check_service_version(name, "1"), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 's=$(curl http://$endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"', + f'sky serve update {name} --cloud {generic_cloud} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml', + # Wait for update to be registered + f'sleep 90', + wait_until_no_pending, + _check_replica_in_status( + name, [(4, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), + (2, False, 'READY')]), + *update_check, + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), + f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + 'curl http://$endpoint | grep "Hi, SkyPilot here"', + _check_replica_in_status(name, [(4, True, 'READY'), + (1, False, 'READY')]), + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack +@pytest.mark.serve +def test_skyserve_failures(generic_cloud: str): + """Test replica failure statuses""" + name = _get_service_name() + + test = Test( + 'test-skyserve-failures', + [ + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', + f's=$(sky serve status {name}); ' + f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done;', + 'sleep 60', + f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep 
"FAILED_INITIAL_DELAY" | wc -l | grep 2; ' + # Make sure no new replicas are started for early failure. + f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 2;', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/failures/probing.yaml', + f's=$(sky serve status {name}); ' + # Wait for replica to be ready. + f'until echo "$s" | grep "READY"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done;', + # Wait for replica to change to FAILED_PROBING + f's=$(sky serve status {name}); ' + f'until echo "$s" | grep "FAILED_PROBING"; do ' + 'echo "Waiting for replica to be failed..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done', + # Wait for the PENDING replica to appear. + 'sleep 10', + # Wait until the replica is out of PENDING. + f's=$(sky serve status {name}); ' + f'until ! echo "$s" | grep "PENDING" && ! echo "$s" | grep "Please wait for the controller to be ready."; do ' + 'echo "Waiting for replica to be out of pending..."; sleep 5; ' + f's=$(sky serve status {name}); echo "$s"; done; ' + + _check_replica_in_status( + name, [(1, False, 'FAILED_PROBING'), + (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), + # TODO(zhwu): add test for FAILED_PROVISION + ], + _TEARDOWN_SERVICE.format(name=name), + timeout=20 * 60, + ) + run_one_test(test) + + +# TODO(Ziming, Tian): Add tests for autoscaling. 
+ + +# ------- Testing user dependencies -------- +def test_user_dependencies(generic_cloud: str): + name = get_cluster_name() + test = Test( + 'user-dependencies', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', + f'sky logs {name} 1 --status', + f'sky exec {name} "echo hi"', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + f'sky exec {name} "echo bye"', + f'sky logs {name} 3 --status', + f'sky launch -c {name} tests/test_yamls/different_default_conda_env.yaml', + f'sky logs {name} 4 --status', + # Launch again to test the default env does not affect SkyPilot + # runtime setup + f'sky launch -c {name} "python --version 2>&1 | grep \'Python 3.6\' || exit 1"', + f'sky logs {name} 5 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) diff --git a/tests/smoke_tests/test_smoke.py b/tests/smoke_tests/test_smoke.py deleted file mode 100644 index 348c880d7a7..00000000000 --- a/tests/smoke_tests/test_smoke.py +++ /dev/null @@ -1,5077 +0,0 @@ -# Smoke tests for SkyPilot -# Default options are set in pyproject.toml -# Example usage: -# Run all tests except for AWS and Lambda Cloud -# > pytest tests/test_smoke.py -# -# Terminate failed clusters after test finishes -# > pytest tests/test_smoke.py --terminate-on-failure -# -# Re-run last failed tests -# > pytest --lf -# -# Run one of the smoke tests -# > pytest tests/test_smoke.py::test_minimal -# -# Only run managed job tests -# > pytest tests/test_smoke.py --managed-jobs -# -# Only run sky serve tests -# > pytest tests/test_smoke.py --sky-serve -# -# Only run test for AWS + generic tests -# > pytest tests/test_smoke.py --aws -# -# Change cloud for generic tests to aws -# > pytest tests/test_smoke.py --generic-cloud aws - -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import sys -import tempfile -import textwrap -import time -from typing import Dict, List, Optional, Tuple -import 
urllib.parse -import uuid - -import jinja2 -import pytest -from smoke_tests.util import _BUMP_UP_SECONDS -from smoke_tests.util import _get_cluster_name -from smoke_tests.util import _GET_JOB_QUEUE -from smoke_tests.util import _get_timeout -from smoke_tests.util import _JOB_WAIT_NOT_RUNNING -from smoke_tests.util import _terminate_gcp_replica -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID -from smoke_tests.util import ( - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB) -from smoke_tests.util import ( - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) -from smoke_tests.util import get_aws_region_for_quota_failover -from smoke_tests.util import get_gcp_region_for_quota_failover -from smoke_tests.util import LAMBDA_TYPE -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS -from smoke_tests.util import Test -from smoke_tests.util import test_id - -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events -from sky.skylet.job_lib import JobStatus -from sky.status_lib import ClusterStatus -from sky.utils import common_utils -from sky.utils import resources_utils -from sky.utils import subprocess_utils - - -# ------------ Test stale job ------------ -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation 
-@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_stale_job(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'stale_job', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=100), - f'sky start {name} -y', - f'sky logs {name} 1 --status', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED_DRIVER', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_stale_job_manual_restart(): - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = Test( - 'aws_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text`; ' - f'aws ec2 stop-instances --region {region} ' - '--instance-ids $id', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=40), - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. 
- _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( - cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_stale_job_manual_restart(): - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - zone = 'us-west2-a' - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name={name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - stop_cmd = (f'gcloud compute instances stop --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'gcp_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - stop_cmd, - 'sleep 40', - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( - cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Check Sky's environment variables; workdir. ---------- -@pytest.mark.no_fluidstack # Requires amazon S3 -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_env_check(generic_cloud: str): - name = _get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( - 'env_check', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -# ---------- file_mounts ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. -def test_file_mounts(generic_cloud: str): - name = _get_cluster_name() - extra_flags = '' - if generic_cloud in 'kubernetes': - # Kubernetes does not support multi-node - # NOTE: This test will fail if you have a Kubernetes cluster running on - # arm64 (e.g., Apple Silicon) since goofys does not work on arm64. - extra_flags = '--num-nodes 1' - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ] - test = Test( - 'using_file_mounts', - test_commands, - f'sky down -y {name}', - _get_timeout(generic_cloud, 20 * 60), # 20 mins - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_file_mounts(): - name = _get_cluster_name() - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} {SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ] - test = Test( - 'SCP_using_file_mounts', - test_commands, - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires GCP to be enabled -def test_using_file_mounts_with_env_vars(generic_cloud: str): - name = _get_cluster_name() - storage_name = TestStorageWithCredentials.generate_bucket_name() - test_commands = [ - *STORAGE_SETUP_COMMANDS, - (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name}'), - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- # Override with --env: - (f'sky launch -y -c {name}-2 --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name} ' - '--env MY_LOCAL_PATH=tmpfile'), - f'sky logs {name}-2 1 --status', # Ensure the job succeeded. - ] - test = Test( - 'using_file_mounts_with_env_vars', - test_commands, - (f'sky down -y {name} {name}-2', - f'sky storage delete -y {storage_name} {storage_name}-2'), - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -# ---------- storage ---------- -@pytest.mark.aws -def test_aws_storage_mounts_with_stop(): - name = _get_cluster_name() - cloud = 'aws' - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'aws s3 ls {storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'aws_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_storage_mounts_with_stop(): - name = _get_cluster_name() - cloud = 'gcp' - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'gsutil ls gs://{storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'gcp_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_storage_mounts_with_stop(): - name = _get_cluster_name() - cloud = 'azure' - storage_name = f'sky-test-{int(time.time())}' - default_region = 'eastus' - storage_account_name = (storage_lib.AzureBlobStore. 
- get_default_storage_account_name(default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'output=$(az storage blob list -c {storage_name} --account-name {storage_account_name} --account-key {storage_account_key} --prefix hello.txt)' - # if the file does not exist, az storage blob list returns '[]' - f'[ "$output" = "[]" ] && exit 1;' - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'azure_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_storage_mounts(): - # Tests bucket mounting on k8s, assuming S3 is configured. - # This test will fail if run on non x86_64 architecture, since goofys is - # built for x86_64 only. 
- name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud kubernetes {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'aws s3 ls {storage_name}/hello.txt || ' - f'gsutil ls gs://{storage_name}/hello.txt', - ] - test = Test( - 'kubernetes_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_switch(): - name = _get_cluster_name() - new_context = f'sky-test-context-{int(time.time())}' - new_namespace = f'sky-test-namespace-{int(time.time())}' - - test_commands = [ - # Launch a cluster and run a simple task - f'sky launch -y -c {name} --cloud kubernetes "echo Hello from original context"', - f'sky logs {name} 1 --status', # Ensure job succeeded - - # Get current context details and save to a file for later use in cleanup - 'CURRENT_CONTEXT=$(kubectl config current-context); ' - 'echo "$CURRENT_CONTEXT" > /tmp/sky_test_current_context; ' - 'CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.cluster}"); ' - 'CURRENT_USER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.user}"); ' - - # Create a new context with a different name and namespace - f'kubectl config set-context {new_context} --cluster="$CURRENT_CLUSTER" --user="$CURRENT_USER" --namespace={new_namespace}', - - # Create the new namespace if it doesn't exist - f'kubectl create namespace {new_namespace} --dry-run=client -o yaml | kubectl 
apply -f -', - - # Set the new context as active - f'kubectl config use-context {new_context}', - - # Verify the new context is active - f'[ "$(kubectl config current-context)" = "{new_context}" ] || exit 1', - - # Try to run sky exec on the original cluster (should still work) - f'sky exec {name} "echo Success: sky exec works after context switch"', - - # Test sky queue - f'sky queue {name}', - - # Test SSH access - f'ssh {name} whoami', - ] - - cleanup_commands = ( - f'kubectl delete namespace {new_namespace}; ' - f'kubectl config delete-context {new_context}; ' - 'kubectl config use-context $(cat /tmp/sky_test_current_context); ' - 'rm /tmp/sky_test_current_context; ' - f'sky down -y {name}') - - test = Test( - 'kubernetes_context_switch', - test_commands, - cleanup_commands, - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test image with python 3.11 installed by default. - 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - ]) -def test_docker_storage_mounts(generic_cloud: str, image_id: str): - # Tests bucket mounting on docker container - name = _get_cluster_name() - timestamp = str(time.time()).replace('.', '') - storage_name = f'sky-test-{timestamp}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - # ubuntu 18.04 does not support fuse3, and blobfuse2 depends on fuse3. - azure_mount_unsupported_ubuntu_version = '18.04' - # Commands to verify bucket upload. We need to check all three - # storage types because the optimizer may pick any of them. 
- s3_command = f'aws s3 ls {storage_name}/hello.txt' - gsutil_command = f'gsutil ls gs://{storage_name}/hello.txt' - azure_blob_command = TestStorageWithCredentials.cli_ls_cmd( - storage_lib.StoreType.AZURE, storage_name, suffix='hello.txt') - if azure_mount_unsupported_ubuntu_version in image_id: - # The store for mount_private_mount is not specified in the template. - # If we're running on Azure, the private mount will be created on - # azure blob. That will not be supported on the ubuntu 18.04 image - # and thus fail. For other clouds, the private mount on other - # storage types (GCS/S3) should succeed. - include_private_mount = False if generic_cloud == 'azure' else True - content = template.render(storage_name=storage_name, - include_azure_mount=False, - include_private_mount=include_private_mount) - else: - content = template.render(storage_name=storage_name,) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - # Check AWS, GCP, or Azure storage mount. 
- f'{s3_command} || ' - f'{gsutil_command} || ' - f'{azure_blob_command}', - ] - test = Test( - 'docker_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.cloudflare -def test_cloudflare_storage_mounts(generic_cloud: str): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_r2_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - endpoint_url = cloudflare.create_endpoint() - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2' - ] - - test = Test( - 'cloudflare_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_storage_mounts(): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - storage_name, Rclone.RcloneClouds.IBM) - template_str = pathlib.Path( - 'tests/test_yamls/test_ibm_cos_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud ibm {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt', - ] - test = Test( - 'ibm_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -# ---------- CLI logs ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. -def test_cli_logs(generic_cloud: str): - name = _get_cluster_name() - num_nodes = 2 - if generic_cloud == 'kubernetes': - # Kubernetes does not support multi-node - num_nodes = 1 - timestamp = time.time() - test = Test('cli_logs', [ - f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], f'sky down -y {name}') - run_one_test(test) - - -@pytest.mark.scp -def test_scp_logs(): - name = _get_cluster_name() - timestamp = time.time() - test = Test( - 'SCP_cli_logs', - [ - f'sky launch -y -c {name} {SCP_TYPE} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Job Queue. ---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue instead -@pytest.mark.no_scp # SCP does not have T4 gpus. 
Run test_scp_job_queue instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. -@pytest.mark.no_oci # OCI does not have T4 gpus -def test_job_queue(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'job_queue', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 -d examples/job_queue/job.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Job Queue with Docker. ---------- -@pytest.mark.no_fluidstack # FluidStack does not support docker for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_paperspace # Paperspace doesn't have T4 GPUs -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. 
- 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - # Axolotl image is a good example custom image that has its conda path - # set in PATH with dockerfile and uses python>=3.12. It could test: - # 1. we handle the env var set in dockerfile correctly - # 2. python>=3.12 works with SkyPilot runtime. - 'docker:winglian/axolotl:main-latest' - ]) -def test_job_queue_with_docker(generic_cloud: str, image_id: str): - name = _get_cluster_name() + image_id[len('docker:'):][:4] - total_timeout_minutes = 40 if generic_cloud == 'azure' else 15 - time_to_sleep = 300 if generic_cloud == 'azure' else 180 - test = Test( - 'job_queue_with_docker', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', - f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - # Make sure the GPU is still visible to the container. 
- f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 4 --status', - f'sky stop -y {name}', - # Make sure the job status preserve after stop and start the - # cluster. This is also a test for the docker container to be - # preserved after stop and start. - f'sky start -y {name}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - # Make sure it is still visible after an stop & start cycle. - f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 7 --status' - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_job_queue(): - name = _get_cluster_name() - test = Test( - 'lambda_job_queue', - [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_job_queue(): - name = _get_cluster_name() - test = Test( - 
'ibm_job_queue', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100', - f'sky exec {name} -n {name}-1 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-2 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-3 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_job_queue(): - name = _get_cluster_name() - num_of_gpu_launch = 1 - num_of_gpu_exec = 0.5 - test = Test( - 'SCP_job_queue', - [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue_multinode instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus. 
-@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet -def test_job_queue_multinode(generic_cloud: str): - name = _get_cluster_name() - total_timeout_minutes = 30 if generic_cloud == 'azure' else 15 - test = Test( - 'job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml', - f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - 'sleep 90', - f'sky cancel -y {name} 1', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- 'sleep 5', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_large_job_queue(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', - f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', - 'sleep 90', - - # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. - # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. 
- f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', - # Make sure the jobs are scheduled in FIFO order - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(1, 17) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in range(17, 33) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep PENDING' - for i in range(33, 75) - ], - f'sky cancel -y {name} 33 35 37 39 17 18 19', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(33, 40, 2) - ], - 'sleep 10', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in [34, 36, 38] - ], - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_fast_large_job_queue(generic_cloud: str): - # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. 
- name = _get_cluster_name() - test = Test( - 'fast_large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 32`; do sky exec {name} -n {name}-$i -d "echo $i"; done', - 'sleep 60', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep SUCCEEDED | wc -l | grep 32', - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_job_queue_multinode(): - name = _get_cluster_name() - task_file = 'examples/job_queue/job_multinode_ibm.yaml' - test = Test( - 'ibm_job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2', - f'sky exec {name} -n {name}-1 -d {task_file}', - f'sky exec {name} -n {name}-2 -d {task_file}', - f'sky launch -y -c {name} -n {name}-3 --detach-setup -d {task_file}', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep SETTING_UP)', - 'sleep 90', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - f'sky cancel -y {name} 1', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d {task_file}', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus v100:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -# ---------- Docker with preinstalled package. ---------- -@pytest.mark.no_fluidstack # Doesn't support Fluidstack for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -# TODO(zhwu): we should fix this for kubernetes -def test_docker_preinstalled_package(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'docker_with_preinstalled_package', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx', - f'sky exec {name} "nginx -V"', - f'sky logs {name} 1 --status', - f'sky exec {name} whoami | grep root', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Submitting multiple tasks to the same cluster. 
---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus -def test_multi_echo(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'multi_echo', - [ - f'python examples/multi_echo.py {name} {generic_cloud}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 10', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - # Make sure that our job scheduler is fast enough to have at least - # 10 RUNNING jobs in parallel. - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "RUNNING" | wc -l | awk \'{{if ($1 < 10) exit 1}}\'', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done', - ] + - # Ensure jobs succeeded. - [ - _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( - cluster_name=name, - job_id=i + 1, - job_status=JobStatus.SUCCEEDED.value, - timeout=120) for i in range(32) - ] + - # Ensure monitor/autoscaler didn't crash on the 'assert not - # unfulfilled' error. If process not found, grep->ssh returns 1. - [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Task: 1 node training. 
---------- -@pytest.mark.no_fluidstack # Fluidstack does not have T4 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. -def test_huggingface(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_huggingface(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'lambda_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_huggingface(generic_cloud: str): - name = _get_cluster_name() - num_of_gpu_launch = 1 - test = Test( - 'SCP_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Inferentia. 
---------- -@pytest.mark.aws -def test_inferentia(): - name = _get_cluster_name() - test = Test( - 'test_inferentia', - [ - f'sky launch -y -c {name} -t inf2.xlarge -- echo hi', - f'sky exec {name} --gpus Inferentia:1 echo hi', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- TPU. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu(): - name = _get_cluster_name() - test = Test( - 'tpu_app', - [ - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take >20 mins - ) - run_one_test(test) - - -# ---------- TPU VM. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm(): - name = _get_cluster_name() - test = Test( - 'tpu_vm_app', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', # Ensure the cluster is STOPPED. - # Use retry: guard against transient errors observed for - # just-stopped TPU VMs (#962). - f'sky start --retry-until-up -y {name}', - f'sky exec {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - run_one_test(test) - - -# ---------- TPU VM Pod. 
---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm_pod(): - name = _get_cluster_name() - test = Test( - 'tpu_pod', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - run_one_test(test) - - -# ---------- TPU Pod Slice on GKE. ---------- -@pytest.mark.kubernetes -def test_tpu_pod_slice_gke(): - name = _get_cluster_name() - test = Test( - 'tpu_pod_slice_gke', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. - f'sky logs {name} 2 --status' - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - run_one_test(test) - - -# ---------- Simple apps. ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_hostname(generic_cloud: str): - name = _get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( - 'multi_hostname', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 1 | grep "My hostname:" | wc -l | grep 2', # Ensure there are 2 hosts. - f'sky exec {name} examples/multi_hostname.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=_get_timeout(generic_cloud, total_timeout_minutes * 60), - ) - run_one_test(test) - - -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_node_failure(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'multi_node_failure', - [ - # TODO(zhwu): we use multi-thread to run the commands in setup - # commands in parallel, which makes it impossible to fail fast - # when one of the nodes fails. We should fix this in the future. - # The --detach-setup version can fail fast, as the setup is - # submitted to the remote machine, which does not use multi-thread. - # Refer to the comment in `subprocess_utils.run_in_parallel`. - # f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/failed_worker_setup.yaml && exit 1', # Ensure the job setup failed. - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup tests/test_yamls/failed_worker_setup.yaml', - f'sky logs {name} 1 --status | grep FAILED_SETUP', # Ensure the job setup failed. - f'sky exec {name} tests/test_yamls/failed_worker_run.yaml', - f'sky logs {name} 2 --status | grep FAILED', # Ensure the job failed. - f'sky logs {name} 2 | grep "My hostname:" | wc -l | grep 2', # Ensure there 2 of the hosts printed their hostname. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on GCP. ---------- -@pytest.mark.gcp -def test_gcp_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'gcp_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on AWS. ---------- -@pytest.mark.aws -def test_aws_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'aws_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on Azure. ---------- -@pytest.mark.azure -def test_azure_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'azure_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on Kubernetes. ---------- -@pytest.mark.kubernetes -def test_kubernetes_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'kubernetes_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 100); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 5; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on Paperspace. ---------- -@pytest.mark.paperspace -def test_paperspace_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'paperspace_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Web apps with custom ports on RunPod. ---------- -@pytest.mark.runpod -def test_runpod_http_server_with_custom_ports(): - name = _get_cluster_name() - test = Test( - 'runpod_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Labels from task on AWS (instance_tags) ---------- -@pytest.mark.aws -def test_task_labels_aws(): - name = _get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='aws', region='us-east-1') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = Test( - 'task_labels_aws', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with aws cli that the tags are set. - 'aws ec2 describe-instances ' - '--query "Reservations[*].Instances[*].InstanceId" ' - '--filters "Name=instance-state-name,Values=running" ' - f'--filters "Name=tag:skypilot-cluster-name,Values={name}*" ' - '--filters "Name=tag:inlinelabel1,Values=inlinevalue1" ' - '--filters "Name=tag:inlinelabel2,Values=inlinevalue2" ' - '--region us-east-1 --output text', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Labels from task on GCP (labels) ---------- -@pytest.mark.gcp -def test_task_labels_gcp(): - name = _get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='gcp') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = Test( - 'task_labels_gcp', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with gcloud cli that the tags are set - f'gcloud compute instances list --filter="name~\'^{name}\' AND ' - 'labels.inlinelabel1=\'inlinevalue1\' AND ' - 'labels.inlinelabel2=\'inlinevalue2\'" ' - '--format="value(name)" | grep .', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Labels from task on 
Kubernetes (labels) ---------- -@pytest.mark.kubernetes -def test_task_labels_kubernetes(): - name = _get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='kubernetes') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = Test( - 'task_labels_kubernetes', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with kubectl that the labels are set. - 'kubectl get pods ' - '--selector inlinelabel1=inlinevalue1 ' - '--selector inlinelabel2=inlinevalue2 ' - '-o jsonpath=\'{.items[*].metadata.name}\' | ' - f'grep \'^{name}\'' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Pod Annotations on Kubernetes ---------- -@pytest.mark.kubernetes -def test_add_pod_annotations_for_autodown_with_launch(): - name = _get_cluster_name() - test = Test( - 'add_pod_annotations_for_autodown_with_launch', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - # Autodown is set. - f'sky launch -y -c {name} -i 10 --down --num-nodes 2 --cpus=1 --cloud kubernetes', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. 
- 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop' - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_add_and_remove_pod_annotations_with_autostop(): - name = _get_cluster_name() - test = Test( - 'add_and_remove_pod_annotations_with_autostop', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - f'sky launch -y -c {name} --num-nodes 2 --cpus=1 --cloud kubernetes', - # Set autodown on the cluster with 'autostop' command. - f'sky autostop -y {name} -i 20 --down', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. - 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - # Cancel the set autodown to remove the annotations from the pods. - f'sky autostop -y {name} --cancel', - # Describe the first pod and check if annotations are removed. - '! kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - '! kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check if annotations are removed. - '! kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - '! 
kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Container logs from task on Kubernetes ---------- -@pytest.mark.kubernetes -def test_container_logs_multinode_kubernetes(): - name = _get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - head_logs = ('kubectl get pods ' - f' | grep {name} | grep head | ' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - worker_logs = ('kubectl get pods ' - f' | grep {name} | grep worker |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( - 'container_logs_multinode_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml} --num-nodes 2', - f'{head_logs} | wc -l | grep 9', - f'{worker_logs} | wc -l | grep 9', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_container_logs_two_jobs_kubernetes(): - name = _get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( - 'test_container_logs_two_jobs_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 9', - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def 
test_container_logs_two_simultaneous_jobs_kubernetes(): - name = _get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( - 'test_container_logs_two_simultaneous_jobs_kubernetes', - [ - f'sky launch -y -c {name}', - f'sky exec -c {name} -d {task_yaml}', - f'sky exec -c {name} -d {task_yaml}', - 'sleep 30', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Task: n=2 nodes with setups. ---------- -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.skip( - reason= - 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') -def test_distributed_tf(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'resnet_distributed_tf_app', - [ - # NOTE: running it twice will hang (sometimes?) - an app-level bug. - f'python examples/resnet_distributed_tf_app.py {name} {generic_cloud}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=25 * 60, # 25 mins (it takes around ~19 mins) - ) - run_one_test(test) - - -# ---------- Testing GCP start and stop instances ---------- -@pytest.mark.gcp -def test_gcp_start_stop(): - name = _get_cluster_name() - test = Test( - 'gcp-start-stop', - [ - f'sky launch -y -c {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. - f'sky logs {name} 3 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=40), - f'sky start -y {name} -i 1', - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', - timeout=200), - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing Azure start and stop instances ---------- -@pytest.mark.azure -def test_azure_start_stop(): - name = _get_cluster_name() - test = Test( - 'azure-start-stop', - [ - f'sky launch -y -c {name} examples/azure_start_stop.yaml', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', - timeout=280) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins - ) - run_one_test(test) - - -# ---------- Testing Autostopping ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_ibm # FIX(IBM) sporadically fails, as restarted workers stay uninitialized indefinitely -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_kubernetes # Kubernetes does not autostop yet -def test_autostop(generic_cloud: str): - name = _get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. - autostop_timeout = 600 if generic_cloud == 'azure' else 250 - # Launching and starting Azure clusters can take a long time too. e.g., restart - # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. - total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 - test = Test( - 'autostop', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} -i 1', - - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m"', - - # Ensure the cluster is not stopped early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - - # Ensure the cluster is STOPPED. 
- _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), - - # Ensure the cluster is UP and the autostop setting is reset ('-'). - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - - # Ensure the job succeeded. - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - - # Test restarting the idleness timer via reset: - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 40', # Almost reached the threshold. - f'sky autostop -y {name} -i 1', # Should restart the timer. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout), - - # Test restarting the idleness timer via exec: - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 45', # Almost reached the threshold. - f'sky exec {name} echo hi', # Should restart the timer. - 'sleep 45', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=autostop_timeout + _BUMP_UP_SECONDS), - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -# ---------- Testing Autodowning ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. -def test_autodown(generic_cloud: str): - name = _get_cluster_name() - # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure - # the VM is terminated. 
- autodown_timeout = 900 if generic_cloud == 'azure' else 240 - total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 - test = Test( - 'autodown', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - # Ensure the cluster is terminated. - f'sleep {autodown_timeout}', - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - f'sleep {autodown_timeout}', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - f'sleep {autodown_timeout}', - # Ensure the cluster is still UP. 
- f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - run_one_test(test) - - -@pytest.mark.scp -def test_scp_autodown(): - name = _get_cluster_name() - test = Test( - 'SCP_autodown', - [ - f'sky launch -y -d -c {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 45', - f'sky status --refresh | grep {name} | grep UP', - # Ensure the cluster is terminated. - 'sleep 200', - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - 'sleep 200', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - 'sleep 200', - # Ensure the cluster is still UP. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): - test = Test( - f'{cloud}-cancel-task', - [ - f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', - # Wait the GPU process to start. 
- 'sleep 60', - f'sky exec {name} "nvidia-smi | grep python"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky cancel -y {name} 1', - 'sleep 60', - # check if the python job is gone. - f'sky exec {name} "! nvidia-smi | grep python"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=timeout, - ) - return test - - -# ---------- Testing `sky cancel` ---------- -@pytest.mark.aws -def test_cancel_aws(): - name = _get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'aws') - run_one_test(test) - - -@pytest.mark.gcp -def test_cancel_gcp(): - name = _get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'gcp') - run_one_test(test) - - -@pytest.mark.azure -def test_cancel_azure(): - name = _get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_cancel_pytorch(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'cancel-pytorch', - [ - f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d', - # Wait the GPU process to start. - 'sleep 90', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || ' - # When run inside container/k8s, nvidia-smi cannot show process ids. - # See https://github.com/NVIDIA/nvidia-docker/issues/179 - # To work around, we check if GPU utilization is greater than 0. - f'[ \$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits) -gt 0 ]"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- f'sky cancel -y {name} 1', - 'sleep 60', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep \'No running process\') || ' - # Ensure Xorg is the only process running. - '[ \$(nvidia-smi | grep -A 10 Processes | grep -A 10 === | grep -v Xorg) -eq 2 ]"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -# can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` -# requires a CUDA public image, which IBM doesn't offer -@pytest.mark.ibm -def test_cancel_ibm(): - name = _get_cluster_name() - test = Test( - 'ibm-cancel-task', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', - f'sky exec {name} -n {name}-1 -d "while true; do echo \'Hello SkyPilot\'; sleep 2; done"', - 'sleep 20', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky cancel -y {name} 2', - f'sleep 5', - f'sky queue {name} | grep {name}-1 | grep CANCELLED', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing use-spot option ---------- -@pytest.mark.no_fluidstack # FluidStack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -def test_use_spot(generic_cloud: str): - """Test use-spot and sky exec.""" - name = _get_cluster_name() - test = Test( - 'use-spot', - [ - f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', - f'sky logs {name} 1 --status', - f'sky exec {name} echo hi', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_stop_gcp_spot(): - """Test GCP spot can be stopped, autostopped, restarted.""" - name = 
_get_cluster_name() - test = Test( - 'stop_gcp_spot', - [ - f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', - # stop should go through: - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 2 --status', - f'sky autostop {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=90), - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 3 --status', - # -i option at launch should go through: - f'sky launch -c {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, - timeout=120), - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing managed job ---------- -# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues -# when the controller being on Azure, which takes a long time for launching -# step. 
-@pytest.mark.managed_jobs -def test_managed_jobs(generic_cloud: str): - """Test the managed jobs yaml.""" - name = _get_cluster_name() - test = Test( - 'managed-jobs', - [ - f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-1', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', - timeout=60), - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', - timeout=60), - f'sky jobs cancel -y -n {name}-1', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-1', - job_status=f'{ManagedJobStatus.CANCELLED.value}', - timeout=230), - # Test the functionality for logging. - f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', - f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', - ], - # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when - # canceling multiple job names is supported. - f'sky jobs cancel -y -n {name}-1; sky jobs cancel -y -n {name}-2', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_job_pipeline(generic_cloud: str): - """Test a job pipeline.""" - name = _get_cluster_name() - test = Test( - 'spot-pipeline', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', - # `grep -A 4 {name}` finds the job with {name} and the 4 lines - # after it, i.e. the 4 tasks within the job. - # `sed -n 2p` gets the second line of the 4 lines, i.e. the first - # task within the job. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', - f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_failed_setup(generic_cloud: str): - """Test managed job with failed setup.""" - name = _get_cluster_name() - test = Test( - 'managed_jobs_failed_setup', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - # Make sure the job failed quickly. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', - timeout=330 + _BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): - """Test managed job with failed setup for a pipeline.""" - name = _get_cluster_name() - test = Test( - 'managed_jobs_pipeline_failed_setup', - [ - f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', - timeout=600), - # Make sure the job failed quickly. - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', - # Task 0 should be SUCCEEDED. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', - # Task 1 should be FAILED_SETUP. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', - # Task 2 should be CANCELLED. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - # Task 3 should be CANCELLED. - f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=30 * 60, - ) - run_one_test(test) - - -# ---------- Testing managed job recovery ---------- - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_aws(aws_config_region): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = Test( - 'managed_jobs_recovery_aws', - [ - f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=600), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_gcp(): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-east4-b' - query_cmd = ( - f'gcloud compute 
instances list --filter=' - # `:` means prefix match. - f'"(labels.ray-cluster-name:{name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_recovery_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=300), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - terminate_cmd, - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_aws(aws_config_region): - """Test managed job recovery for a pipeline.""" - name = _get_cluster_name() - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - region = aws_config_region - if region != 'us-east-2': - pytest.skip('Only run spot pipeline recovery test in us-east-2') - test = Test( - 'managed_jobs_pipeline_recovery_aws', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} 
--no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', - # Terminate the cluster manually. - # The `cat ...| rev` is to retrieve the job_id from the - # SKYPILOT_TASK_ID, which gets the second to last field - # separated by `-`. - ( - f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - 'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;' - f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - # TODO(zhwu): fix the name for spot cluster. - '--filters Name=tag:ray-cluster-name,Values=*-${MANAGED_JOB_ID}' - f'-{user_hash} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', - f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', - f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_gcp(): - """Test managed job recovery for a pipeline.""" - name = _get_cluster_name() - zone = 'us-east4-b' - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - query_cmd = ( - 'gcloud compute instances list --filter=' - 
f'"(labels.ray-cluster-name:*-${{MANAGED_JOB_ID}}-{user_hash})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_pipeline_recovery_gcp', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', - # Terminate the cluster manually. - # The `cat ...| rev` is to retrieve the job_id from the - # SKYPILOT_TASK_ID, which gets the second to last field - # separated by `-`. - (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', - f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', - f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does 
not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_default_resources(generic_cloud: str): - """Test managed job recovery for default resources.""" - name = _get_cluster_name() - test = Test( - 'managed-spot-recovery-default-resources', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', - timeout=360), - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_aws(aws_config_region): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = Test( - 'managed_jobs_recovery_multi_node_aws', - [ - f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=450), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. 
- (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - 'Name=tag:ray-node-type,Values=worker ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_gcp(): - """Test managed job recovery.""" - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west2-a' - # Use ':' to match as the cluster name will contain the suffix with job id - query_cmd = ( - f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_on_cloud} AND ' - f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_recovery_multi_node_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. 
- terminate_cmd, - _JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.RUNNING.value, - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_aws(aws_config_region): - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_2_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-2', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = Test( - 'managed_jobs_cancellation_aws', - [ - # Test cancellation during spot cluster being launched. 
- f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status= - f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', - timeout=60 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - # The job is running in the cluster, will shown as RUNNING. 
- _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - # Terminate the cluster manually. - (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "pending|running|stopped|stopping"' - ), - ], - timeout=25 * 60) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_gcp(): - name = _get_cluster_name() - name_3 = f'{name}-3' - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west3-b' - query_state_cmd = ( - 'gcloud compute instances list ' - f'--filter="(labels.ray-cluster-name:{name_3_on_cloud})" ' - '--format="value(status)"') - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_3_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete 
--zone={zone}' - f' --quiet $({query_cmd})') - test = Test( - 'managed_jobs_cancellation_gcp', - [ - # Test cancellation during spot cluster being launched. - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.STARTING.value, - timeout=60 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, - timeout=300 + _BUMP_UP_SECONDS), - # Terminate the cluster manually. 
- terminate_cmd, - _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, - timeout=120 + _BUMP_UP_SECONDS), - # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' - ), - ], - timeout=25 * 60) - run_one_test(test) - - -# ---------- Testing storage for managed job ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_storage(generic_cloud: str): - """Test storage with managed job""" - name = _get_cluster_name() - yaml_str = pathlib.Path( - 'examples/managed_job_with_storage.yaml').read_text() - timestamp = int(time.time()) - storage_name = f'sky-test-{timestamp}' - output_storage_name = f'sky-test-output-{timestamp}' - - # Also perform region testing for bucket creation to validate if buckets are - # created in the correct region and correctly mounted in managed jobs. - # However, we inject this testing only for AWS and GCP since they are the - # supported object storage providers in SkyPilot. 
- region_flag = '' - region_validation_cmd = 'true' - use_spot = ' --use-spot' - if generic_cloud == 'aws': - region = 'eu-central-1' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.S3, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - output_check_cmd = f'{s3_check_file_count} | grep 1' - elif generic_cloud == 'gcp': - region = 'us-west2' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.GCS, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - output_check_cmd = f'{gcs_check_file_count} | grep 1' - elif generic_cloud == 'azure': - region = 'westus2' - region_flag = f' --region {region}' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name(region)) - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.AZURE, - storage_account_name=storage_account_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - az_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.AZURE, - output_storage_name, - 'output.txt', - storage_account_name=storage_account_name) - output_check_cmd = f'{az_check_file_count} | grep 1' - elif generic_cloud == 'kubernetes': - # With Kubernetes, we don't know which object storage provider is used. - # Check both S3 and GCS if bucket exists in either. 
- s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - s3_output_check_cmd = f'{s3_check_file_count} | grep 1' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - gcs_output_check_cmd = f'{gcs_check_file_count} | grep 1' - output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}' - use_spot = ' --no-use-spot' - - yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) - yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(yaml_str) - f.flush() - file_path = f.name - test = Test( - 'managed_jobs_storage', - [ - *STORAGE_SETUP_COMMANDS, - f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', - region_validation_cmd, # Check if the bucket is created in the correct region - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME. - format(job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, - timeout=60 + _BUMP_UP_SECONDS), - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', - # Check if file was written to the mounted output bucket - output_check_cmd - ], - (f'sky jobs cancel -y -n {name}', - f'; sky storage delete {output_storage_name} || true'), - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Testing spot TPU ---------- -@pytest.mark.gcp -@pytest.mark.managed_jobs -@pytest.mark.tpu -def test_managed_jobs_tpu(): - """Test managed job on TPU.""" - name = _get_cluster_name() - test = Test( - 'test-spot-tpu', - [ - f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.STARTING.value, - timeout=60 + _BUMP_UP_SECONDS), - # TPU takes a while to launch - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', - timeout=900 + _BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Testing env for managed jobs ---------- -@pytest.mark.managed_jobs -def test_managed_jobs_inline_env(generic_cloud: str): - """Test managed jobs env""" - name = _get_cluster_name() - test = Test( - 'test-managed-jobs-inline-env', - [ - f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( - job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, - timeout=20 + _BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - run_one_test(test) - - -# ---------- Testing env ---------- -def test_inline_env(generic_cloud: str): - """Test env""" - name = _get_cluster_name() - test = Test( - 'test-inline-env', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'sky logs {name} 1 --status', - f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - _get_timeout(generic_cloud), - ) - run_one_test(test) - - -# ---------- Testing env file ---------- -def test_inline_env_file(generic_cloud: str): - """Test env""" - name = _get_cluster_name() - test = Test( - 'test-inline-env-file', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 1 --status', - f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - _get_timeout(generic_cloud), - ) - run_one_test(test) - - -# ---------- Testing custom image ---------- -@pytest.mark.aws -def test_aws_custom_image(): - """Test AWS custom image""" - name = _get_cluster_name() - test = Test( - 'test-aws-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image - f'sky logs {name} 1 --status', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. - 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. 
- 'docker:continuumio/miniconda3:latest', - ]) -def test_kubernetes_custom_image(image_id): - """Test Kubernetes custom image""" - name = _get_cluster_name() - test = Test( - 'test-kubernetes-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', - f'sky logs {name} 1 --status', - # Try exec to run again and check if the logs are printed - f'sky exec {name} tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1 | grep "Hello 100"', - # Make sure ssh is working with custom username - f'ssh {name} echo hi | grep hi', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_start_stop_two_nodes(): - name = _get_cluster_name() - test = Test( - 'azure-start-stop-two-nodes', - [ - f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status= - f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', - timeout=200 + _BUMP_UP_SECONDS) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins (it takes around ~23 mins) - ) - run_one_test(test) - - -# ---------- Testing env for disk tier ---------- -@pytest.mark.aws -def test_aws_disk_tier(): - - def _get_aws_query_command(region, instance_id, field, expected): - return (f'aws ec2 describe-volumes --region {region} ' - f'--filters Name=attachment.instance-id,Values={instance_id} ' - f'--query Volumes[*].{field} | grep {expected} ; ') - - for disk_tier in list(resources_utils.DiskTier): - specs = AWS._get_disk_specs(disk_tier) - name = _get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = Test( - 'aws-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud aws --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} --query ' - f'Reservations[].Instances[].InstanceId --output text`; ' + - _get_aws_query_command(region, '$id', 'VolumeType', - specs['disk_tier']) + - ('' if specs['disk_tier'] - == 'standard' else _get_aws_query_command( - region, '$id', 'Iops', specs['disk_iops'])) + - ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( - region, '$id', 'Throughput', specs['disk_throughput'])), - ], - f'sky down -y {name}', - timeout=10 * 60, # 10 mins (it takes around ~6 mins) - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - disk_types = [GCP._get_disk_type(disk_tier)] - name = _get_cluster_name() + '-' + disk_tier.value - name_on_cloud = 
common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - region = 'us-west2' - instance_type_options = [''] - if disk_tier == resources_utils.DiskTier.BEST: - # Ultra disk tier requires n2 instance types to have more than 64 CPUs. - # If using default instance type, it will only enable the high disk tier. - disk_types = [ - GCP._get_disk_type(resources_utils.DiskTier.HIGH), - GCP._get_disk_type(resources_utils.DiskTier.ULTRA), - ] - instance_type_options = ['', '--instance-type n2-standard-64'] - for disk_type, instance_type_option in zip(disk_types, - instance_type_options): - test = Test( - 'gcp-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud gcp --region {region} ' - f'--disk-tier {disk_tier.value} {instance_type_option} ', - f'name=`gcloud compute instances list --filter=' - f'"labels.ray-cluster-name:{name_on_cloud}" ' - '--format="value(name)"`; ' - f'gcloud compute disks list --filter="name=$name" ' - f'--format="value(type)" | grep {disk_type} ' - ], - f'sky down -y {name}', - timeout=6 * 60, # 6 mins (it takes around ~3 mins) - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: - # Azure does not support high and ultra disk tier. 
- continue - type = Azure._get_disk_type(disk_tier) - name = _get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = Test( - 'azure-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}' - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - run_one_test(test) - - -@pytest.mark.azure -def test_azure_best_tier_failover(): - type = Azure._get_disk_type(resources_utils.DiskTier.LOW) - name = _get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = Test( - 'azure-best-tier-failover', - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier best --instance-type Standard_D8_v5 echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - run_one_test(test) - - -# ------ Testing Zero Quota Failover ------ -@pytest.mark.aws -def test_aws_zero_quota_failover(): - - name = _get_cluster_name() - region = get_aws_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for EC2 P3 instances were found on all AWS regions. 
Is this ' - 'expected for your account?') - return - - test = Test( - 'aws-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zero_quota_failover(): - - name = _get_cluster_name() - region = get_gcp_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for A100-80GB GPUs were found on all GCP regions. Is this ' - 'expected for your account?') - return - - test = Test( - 'gcp-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -def test_long_setup_run_script(generic_cloud: str): - name = _get_cluster_name() - with tempfile.NamedTemporaryFile('w', prefix='sky_app_', - suffix='.yaml') as f: - f.write( - textwrap.dedent(""" \ - setup: | - echo "start long setup" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end long setup"\n') - f.write( - textwrap.dedent(""" \ - run: | - echo "run" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end run"\n') - f.flush() - - test = Test( - 'long-setup-run-script', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', - f'sky exec {name} "echo hello"', - f'sky exec {name} {f.name}', - f'sky logs {name} --status 1', - f'sky logs {name} --status 2', - f'sky logs {name} --status 3', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Testing skyserve ---------- - - -def _get_service_name() -> str: - """Returns a user-unique service name for each test_skyserve_(). - - Must be called from each test_skyserve_(). 
- """ - caller_func_name = inspect.stack()[1][3] - test_name = caller_func_name.replace('_', '-').replace('test-', 't-') - test_name = test_name.replace('skyserve-', 'ss-') - test_name = common_utils.make_cluster_name_on_cloud(test_name, 24) - return f'{test_name}-{test_id}' - - -# We check the output of the skyserve service to see if it is ready. Output of -# `REPLICAS` is in the form of `1/2` where the first number is the number of -# ready replicas and the second number is the number of total replicas. We -# grep such format to ensure that the service is ready, and early exit if any -# failure detected. In the end we sleep for -# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have -# enough time to sync with the controller and get all ready replica IPs. -_SERVE_WAIT_UNTIL_READY = ( - '{{ while true; do' - ' s=$(sky serve status {name}); echo "$s";' - ' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;' - ' echo "$s" | grep -q "FAILED" && exit 1;' - ' sleep 10;' - ' done; }}; echo "Got service status $s";' - f'sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS + 2};') -_IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' -_AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' -_SERVICE_LAUNCHING_STATUS_REGEX = 'PROVISIONING\|STARTING' -# Since we don't allow terminate the service if the controller is INIT, -# which is common for simultaneous pytest, we need to wait until the -# controller is UP before we can terminate the service. -# The teardown command has a 10-mins timeout, so we don't need to do -# the timeout here. See implementation of run_one_test() for details. 
-_TEARDOWN_SERVICE = ( - '(for i in `seq 1 20`; do' - ' s=$(sky serve down -y {name});' - ' echo "Trying to terminate {name}";' - ' echo "$s";' - ' echo "$s" | grep -q "scheduled to be terminated\|No service to terminate" && break;' - ' sleep 10;' - ' [ $i -eq 20 ] && echo "Failed to terminate service {name}";' - 'done)') - -_SERVE_ENDPOINT_WAIT = ( - 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' - 'endpoint=$(sky serve status --endpoint {name}); ' - 'until ! echo "$endpoint" | grep "Controller is initializing"; ' - 'do echo "Waiting for serve endpoint to be ready..."; ' - 'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; ' - 'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"') - -_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); ' - 'until ! echo "$s" | grep "Controller is initializing."; ' - 'do echo "Waiting for serve status to be ready..."; ' - 'sleep 5; s=$(sky serve status {name}); done; echo "$s"') - - -def _get_replica_ip(name: str, replica_id: int) -> str: - return (f'ip{replica_id}=$(echo "$s" | ' - f'awk "{_AWK_ALL_LINES_BELOW_REPLICAS}" | ' - f'grep -E "{name}\s+{replica_id}" | ' - f'grep -Eo "{_IP_REGEX}")') - - -def _get_skyserve_http_test(name: str, cloud: str, - timeout_minutes: int) -> Test: - test = Test( - f'test-skyserve-{cloud.replace("_", "-")}', - [ - f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=timeout_minutes * 60, - ) - return test - - -def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool, - str]]) -> str: - """Check replicas' status and count in sky serve status - - We will check vCPU=2, as all our tests use vCPU=2. - - Args: - name: the name of the service - check_tuples: A list of replica property to check. 
Each tuple is - (count, is_spot, status) - """ - check_cmd = '' - for check_tuple in check_tuples: - count, is_spot, status = check_tuple - resource_str = '' - if status not in ['PENDING', 'SHUTTING_DOWN' - ] and not status.startswith('FAILED'): - spot_str = '' - if is_spot: - spot_str = '\[Spot\]' - resource_str = f'({spot_str}vCPU=2)' - check_cmd += (f' echo "$s" | grep "{resource_str}" | ' - f'grep "{status}" | wc -l | grep {count} || exit 1;') - return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd) - - -def _check_service_version(service_name: str, version: str) -> str: - # Grep the lines before 'Service Replicas' and check if the service version - # is correct. - return (f'echo "$s" | grep -B1000 "Service Replicas" | ' - f'grep -E "{service_name}\s+{version}" || exit 1; ') - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_gcp_http(): - """Test skyserve on GCP""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'gcp', 20) - run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.serve -def test_skyserve_aws_http(): - """Test skyserve on AWS""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'aws', 20) - run_one_test(test) - - -@pytest.mark.azure -@pytest.mark.serve -def test_skyserve_azure_http(): - """Test skyserve on Azure""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'azure', 30) - run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.serve -def test_skyserve_kubernetes_http(): - """Test skyserve on Kubernetes""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'kubernetes', 30) - run_one_test(test) - - -@pytest.mark.oci -@pytest.mark.serve -def test_skyserve_oci_http(): - """Test skyserve on OCI""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'oci', 20) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now -@pytest.mark.serve -def test_skyserve_llm(generic_cloud: str): 
- """Test skyserve with real LLM usecase""" - name = _get_service_name() - - def generate_llm_test_command(prompt: str, expected_output: str) -> str: - prompt = shlex.quote(prompt) - expected_output = shlex.quote(expected_output) - return ( - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python tests/skyserve/llm/get_response.py --endpoint $endpoint ' - f'--prompt {prompt} | grep {expected_output}') - - with open('tests/skyserve/llm/prompt_output.json', 'r', - encoding='utf-8') as f: - prompt2output = json.load(f) - - test = Test( - f'test-skyserve-llm', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - *[ - generate_llm_test_command(prompt, output) - for prompt, output in prompt2output.items() - ], - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=40 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_spot_recovery(): - name = _get_service_name() - zone = 'us-central1-a' - - test = Test( - f'test-skyserve-spot-recovery-gcp', - [ - f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - _terminate_gcp_replica(name, zone, 1), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_base_ondemand_fallback(generic_cloud: str): - name = _get_service_name() - test = Test( - 
f'test-skyserve-base-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(1, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_dynamic_ondemand_fallback(): - name = _get_service_name() - zone = 'us-central1-a' - - test = Test( - f'test-skyserve-dynamic-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud gcp -y tests/skyserve/spot/dynamic_ondemand_fallback.yaml', - f'sleep 40', - # 2 on-demand (provisioning) + 2 Spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s";' - 'echo "$s" | grep -q "0/4" || exit 1', - # Wait for the provisioning starts - f'sleep 40', - _check_replica_in_status(name, [ - (2, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (2, False, _SERVICE_LAUNCHING_STATUS_REGEX + '\|SHUTTING_DOWN') - ]), - - # Wait until 2 spot instances are ready. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - _terminate_gcp_replica(name, zone, 1), - f'sleep 40', - # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - 'echo "$s" | grep -q "1/3"', - _check_replica_in_status( - name, [(1, True, 'READY'), - (1, True, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - - # Wait until 2 spot instances are ready. 
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_user_bug_restart(generic_cloud: str): - """Tests that we restart the service after user bug.""" - # TODO(zhwu): this behavior needs some rethinking. - name = _get_service_name() - test = Test( - f'test-skyserve-user-bug-restart', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "SHUTTING_DOWN"; ' - 'do echo "Waiting for first service to be SHUTTING DOWN..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; ', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "FAILED"; ' - 'do echo "Waiting for first service to be FAILED..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; echo "$s"; ' - + _check_replica_in_status(name, [(1, True, 'FAILED')]) + - # User bug failure will cause no further scaling. 
- f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 1; ' - f'echo "$s" | grep -B 100 "NO_REPLICA" | grep "0/0"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/auto_restart.yaml', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, SkyPilot here!"; do sleep 2; done; sleep 2; ' - + _check_replica_in_status(name, [(1, False, 'READY'), - (1, False, 'FAILED')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -@pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP -def test_skyserve_load_balancer(generic_cloud: str): - """Test skyserve load balancer round-robin policy""" - name = _get_service_name() - test = Test( - f'test-skyserve-load-balancer', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - f'{_get_replica_ip(name, 1)}; ' - f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' - 'python tests/skyserve/load_balancer/test_round_robin.py ' - '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_auto_restart(): - """Test skyserve with auto restart""" - name = _get_service_name() - zone = 'us-central1-a' - test = Test( - f'test-skyserve-auto-restart', - [ - # TODO(tian): we can dynamically generate YAML from template to - # avoid maintaining too many YAML files - f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl 
http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - # sleep for 20 seconds (initial delay) to make sure it will - # be restarted - f'sleep 20', - _terminate_gcp_replica(name, zone, 1), - # Wait for consecutive failure timeout passed. - # If the cluster is not using spot, it won't check the cluster status - # on the cloud (since manual shutdown is not a common behavior and such - # queries takes a lot of time). Instead, we think continuous 3 min probe - # failure is not a temporary problem but indeed a failure. - 'sleep 180', - # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time - # that the output of `sky serve status` shows FAILED and this status will - # cause _SERVE_WAIT_UNTIL_READY to early quit. - '(while true; do' - f' output=$(sky serve status {name});' - ' echo "$output" | grep -q "1/1" && break;' - ' sleep 10;' - f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_cancel(generic_cloud: str): - """Test skyserve with cancel""" - name = _get_service_name() - - test = Test( - f'test-skyserve-cancel', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' - 'tests/skyserve/cancel/send_cancel_request.py ' - '--endpoint $endpoint | grep "Request was cancelled"', - f's=$(sky serve logs {name} 1 --no-follow); ' - 'until ! 
echo "$s" | grep "Please wait for the controller to be"; ' - 'do echo "Waiting for serve logs"; sleep 10; ' - f's=$(sky serve logs {name} 1 --no-follow); done; ' - 'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_streaming(generic_cloud: str): - """Test skyserve with streaming""" - name = _get_service_name() - test = Test( - f'test-skyserve-streaming', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python3 tests/skyserve/streaming/send_streaming_request.py ' - '--endpoint $endpoint | grep "Streaming test passed"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_readiness_timeout_fail(generic_cloud: str): - """Test skyserve with large readiness probe latency, expected to fail""" - name = _get_service_name() - test = Test( - f'test-skyserve-readiness-timeout-fail', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', - # None of the readiness probe will pass, so the service will be - # terminated after the initial delay. 
- f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_large_readiness_timeout(generic_cloud: str): - """Test skyserve with customized large readiness timeout""" - name = _get_service_name() - test = Test( - f'test-skyserve-large-readiness-timeout', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_update(generic_cloud: str): - """Test skyserve with update""" - name = _get_service_name() - test = Test( - f'test-skyserve-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/new.yaml', - # sleep before update is registered. 
- 'sleep 20', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;' - # Make sure the traffic is not mixed - 'curl http://$endpoint | grep "Hi, new SkyPilot here"', - # The latest 2 version should be READY and the older versions should be shutting down - (_check_replica_in_status(name, [(2, False, 'READY'), - (2, False, 'SHUTTING_DOWN')]) + - _check_service_version(name, "2")), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_rolling_update(generic_cloud: str): - """Test skyserve with rolling update""" - name = _get_service_name() - single_new_replica = _check_replica_in_status( - name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN')]) - test = Test( - f'test-skyserve-rolling-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/new.yaml', - # Make sure the traffic is mixed across two versions, the replicas - # with even id will sleep 60 seconds before being ready, so we - # should be able to get observe the period that the traffic is mixed - # across two versions. 
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; ' - # The latest version should have one READY and the one of the older versions should be shutting down - f'{single_new_replica} {_check_service_version(name, "1,2")} ' - # Check the output from the old version, immediately after the - # output from the new version appears. This is guaranteed by the - # round robin load balancing policy. - # TODO(zhwu): we should have a more generalized way for checking the - # mixed version of replicas to avoid depending on the specific - # round robin load balancing policy. - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_fast_update(generic_cloud: str): - """Test skyserve with fast update (Increment version of old replicas)""" - name = _get_service_name() - - test = Test( - f'test-skyserve-fast-update', - [ - f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', - # sleep to wait for update to be registered. - 'sleep 40', - # 2 on-demand (ready) + 1 on-demand (provisioning). - ( - _check_replica_in_status( - name, [(2, False, 'READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) + - # Fast update will directly have the latest version ready. 
- _check_service_version(name, "2")), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - # Test rolling update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', - # sleep to wait for update to be registered. - 'sleep 25', - # 2 on-demand (ready) + 1 on-demand (shutting down). - _check_replica_in_status(name, [(2, False, 'READY'), - (1, False, 'SHUTTING_DOWN')]), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_update_autoscale(generic_cloud: str): - """Test skyserve update with autoscale""" - name = _get_service_name() - test = Test( - f'test-skyserve-update-autoscale', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/num_min_one.yaml', - # sleep before update is registered. - 'sleep 20', - # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - # Rolling Update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - # sleep before update is registered. - 'sleep 20', - # Timeout will be triggered when update fails. 
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Spot instances are not supported by Fluidstack -@pytest.mark.serve -@pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes -@pytest.mark.parametrize('mode', ['rolling', 'blue_green']) -def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): - """Test skyserve with update that changes autoscaler""" - name = f'{_get_service_name()}-{mode}' - - wait_until_no_pending = ( - f's=$(sky serve status {name}); echo "$s"; ' - 'until ! echo "$s" | grep PENDING; do ' - ' echo "Waiting for replica to be out of pending..."; ' - f' sleep 5; s=$(sky serve status {name}); ' - ' echo "$s"; ' - 'done') - four_spot_up_cmd = _check_replica_in_status(name, [(4, True, 'READY')]) - update_check = [f'until ({four_spot_up_cmd}); do sleep 5; done; sleep 15;'] - if mode == 'rolling': - # Check rolling update, it will terminate one of the old on-demand - # instances, once there are 4 spot instance ready. - update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN'), (1, False, 'READY')]) + - _check_service_version(name, "1,2"), - ] - else: - # Check blue green update, it will keep both old on-demand instances - # running, once there are 4 spot instance ready. 
- update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]) + - _check_service_version(name, "1"), - ] - test = Test( - f'test-skyserve-new-autoscaler-update-{mode}', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 's=$(curl http://$endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml', - # Wait for update to be registered - f'sleep 90', - wait_until_no_pending, - _check_replica_in_status( - name, [(4, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]), - *update_check, - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - _check_replica_in_status(name, [(4, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_failures(generic_cloud: str): - """Test replica failure statuses""" - name = _get_service_name() - - test = Test( - 'test-skyserve-failures', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep 
"FAILED_INITIAL_DELAY" | wc -l | grep 2; ' - # Make sure no new replicas are started for early failure. - f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 2;', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/failures/probing.yaml', - f's=$(sky serve status {name}); ' - # Wait for replica to be ready. - f'until echo "$s" | grep "READY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - # Wait for replica to change to FAILED_PROBING - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_PROBING"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done', - # Wait for the PENDING replica to appear. - 'sleep 10', - # Wait until the replica is out of PENDING. - f's=$(sky serve status {name}); ' - f'until ! echo "$s" | grep "PENDING" && ! echo "$s" | grep "Please wait for the controller to be ready."; do ' - 'echo "Waiting for replica to be out of pending..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done; ' + - _check_replica_in_status( - name, [(1, False, 'FAILED_PROBING'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - # TODO(zhwu): add test for FAILED_PROVISION - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - run_one_test(test) - - -# TODO(Ziming, Tian): Add tests for autoscaling. 
- - -# ------- Testing user dependencies -------- -def test_user_dependencies(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'user-dependencies', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', - f'sky logs {name} 1 --status', - f'sky exec {name} "echo hi"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - f'sky exec {name} "echo bye"', - f'sky logs {name} 3 --status', - f'sky launch -c {name} tests/test_yamls/different_default_conda_env.yaml', - f'sky logs {name} 4 --status', - # Launch again to test the default env does not affect SkyPilot - # runtime setup - f'sky launch -c {name} "python --version 2>&1 | grep \'Python 3.6\' || exit 1"', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ------- Testing the core API -------- -# Most of the core APIs have been tested in the CLI tests. -# These tests are for testing the return value of the APIs not fully used in CLI. - - -@pytest.mark.gcp -def test_core_api_sky_launch_exec(): - name = _get_cluster_name() - task = sky.Task(run="whoami") - task.set_resources(sky.Resources(cloud=sky.GCP())) - job_id, handle = sky.launch(task, cluster_name=name) - assert job_id == 1 - assert handle is not None - assert handle.cluster_name == name - assert handle.launched_resources.cloud.is_same_cloud(sky.GCP()) - job_id_exec, handle_exec = sky.exec(task, cluster_name=name) - assert job_id_exec == 2 - assert handle_exec is not None - assert handle_exec.cluster_name == name - assert handle_exec.launched_resources.cloud.is_same_cloud(sky.GCP()) - # For dummy task (i.e. task.run is None), the job won't be submitted. - dummy_task = sky.Task() - job_id_dummy, _ = sky.exec(dummy_task, cluster_name=name) - assert job_id_dummy is None - sky.down(name) - - -# The sky launch CLI has some additional checks to make sure the cluster is up/ -# restarted. 
However, the core API doesn't have these; make sure it still works -def test_core_api_sky_launch_fast(generic_cloud: str): - name = _get_cluster_name() - cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) - try: - task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - # Sleep to let the cluster autostop - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( - cluster_name=name, - cluster_status=ClusterStatus.STOPPED, - timeout=120) - # Run it again - should work with fast=True - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - finally: - sky.down(name) - - -# ---------- Testing Storage ---------- -class TestStorageWithCredentials: - """Storage tests which require credentials and network connection""" - - AWS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - 'bucket-s3alias', # ends with '-s3alias' suffix - 'bucket--ol-s3', # ends with '--ol-s3' suffix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - ] - - GCS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters (without dots) - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1' - # More than 63 characters between dots - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqfghijklmnopqrstuvw' * 5, - # more than 222 characters (with dots) - 
'192.168.5.4', # formatted as an IP address - 'googbucket', # starts with 'goog' prefix - 'googlebucket', # contains 'google' - 'g00glebucket', # variant of 'google' - 'go0glebucket', # variant of 'google' - 'g0oglebucket', # variant of 'google' - '.abc', # starts with a dot - 'abc.', # ends with a dot - '_abc', # starts with an underscore - 'abc_', # ends with an underscore - ] - - AZURE_INVALID_NAMES = [ - 'ab', # less than 3 characters - # more than 63 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - 'Abcdef', # contains an uppercase letter - '.abc', # starts with a non-letter(dot) - 'a--bc', # contains consecutive hyphens - ] - - IBM_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - 'a.-bc', # contains the sequence '.-' - 'a-.bc', # contains the sequence '-.' 
- 'a&bc' # contains special characters - 'ab^c' # contains special characters - ] - GITIGNORE_SYNC_TEST_DIR_STRUCTURE = { - 'double_asterisk': { - 'double_asterisk_excluded': None, - 'double_asterisk_excluded_dir': { - 'dir_excluded': None, - }, - }, - 'double_asterisk_parent': { - 'parent': { - 'also_excluded.txt': None, - 'child': { - 'double_asterisk_parent_child_excluded.txt': None, - }, - 'double_asterisk_parent_excluded.txt': None, - }, - }, - 'excluded.log': None, - 'excluded_dir': { - 'excluded.txt': None, - 'nested_excluded': { - 'excluded': None, - }, - }, - 'exp-1': { - 'be_excluded': None, - }, - 'exp-2': { - 'be_excluded': None, - }, - 'front_slash_excluded': None, - 'included.log': None, - 'included.txt': None, - 'include_dir': { - 'excluded.log': None, - 'included.log': None, - }, - 'nested_double_asterisk': { - 'one': { - 'also_exclude.txt': None, - }, - 'two': { - 'also_exclude.txt': None, - }, - }, - 'nested_wildcard_dir': { - 'monday': { - 'also_exclude.txt': None, - }, - 'tuesday': { - 'also_exclude.txt': None, - }, - }, - 'no_slash_excluded': None, - 'no_slash_tests': { - 'no_slash_excluded': { - 'also_excluded.txt': None, - }, - }, - 'question_mark': { - 'excluded1.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket': { - 'excluded1.txt': None, - }, - 'square_bracket_alpha': { - 'excludedz.txt': None, - }, - 'square_bracket_excla': { - 'excluded2.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket_single': { - 'excluded0.txt': None, - }, - } - - @staticmethod - def create_dir_structure(base_path, structure): - # creates a given file STRUCTURE in BASE_PATH - for name, substructure in structure.items(): - path = os.path.join(base_path, name) - if substructure is None: - # Create a file - open(path, 'a', encoding='utf-8').close() - else: - # Create a subdirectory - os.mkdir(path) - TestStorageWithCredentials.create_dir_structure( - path, substructure) - - @staticmethod - def cli_delete_cmd(store_type, - bucket_name, - 
storage_account_name: str = None): - if store_type == storage_lib.StoreType.S3: - url = f's3://{bucket_name}' - return f'aws s3 rb {url} --force' - if store_type == storage_lib.StoreType.GCS: - url = f'gs://{bucket_name}' - gsutil_alias, alias_gen = data_utils.get_gsutil_command() - return f'{alias_gen}; {gsutil_alias} rm -r {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage container delete ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} ' - f'--name {bucket_name}') - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {url} --force --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone purge {bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' - - @staticmethod - def cli_ls_cmd(store_type, bucket_name, suffix=''): - if store_type == storage_lib.StoreType.S3: - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'aws s3 ls {url}' - if store_type == storage_lib.StoreType.GCS: - if suffix: - url = f'gs://{bucket_name}/{suffix}' - else: - url = f'gs://{bucket_name}' - return f'gsutil ls {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - config_storage_account = skypilot_config.get_nested( - ('azure', 'storage_account'), None) - storage_account_name = config_storage_account if ( - config_storage_account is not None) else ( - 
storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - list_cmd = ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key}') - return list_cmd - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' - - @staticmethod - def cli_region_cmd(store_type, bucket_name=None, storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - assert bucket_name is not None - return ('aws s3api get-bucket-location ' - f'--bucket {bucket_name} --output text') - elif store_type == storage_lib.StoreType.GCS: - assert bucket_name is not None - return (f'gsutil ls -L -b gs://{bucket_name}/ | ' - 'grep "Location constraint" | ' - 'awk \'{print tolower($NF)}\'') - elif store_type == storage_lib.StoreType.AZURE: - # For Azure Blob Storage, the location of the containers are - # determined by the location of storage accounts. 
- assert storage_account_name is not None - return (f'az storage account show --name {storage_account_name} ' - '--query "primaryLocation" --output tsv') - else: - raise NotImplementedError(f'Region command not implemented for ' - f'{store_type}') - - @staticmethod - def cli_count_name_in_bucket(store_type, - bucket_name, - file_name, - suffix='', - storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - if suffix: - return f'aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - else: - return f'aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - elif store_type == storage_lib.StoreType.GCS: - if suffix: - return f'gsutil ls -r gs://{bucket_name}/{suffix} | grep "{file_name}" | wc -l' - else: - return f'gsutil ls -r gs://{bucket_name} | grep "{file_name}" | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - if storage_account_name is None: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - f'grep {file_name} | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - else: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --query 
"length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - - @staticmethod - def cli_count_file_in_bucket(store_type, bucket_name): - if store_type == storage_lib.StoreType.S3: - return f'aws s3 ls s3://{bucket_name} --recursive | wc -l' - elif store_type == storage_lib.StoreType.GCS: - return f'gsutil ls -r gs://{bucket_name}/** | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - 'grep \\"name\\": | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{bucket_name} --recursive --endpoint {endpoint_url} --profile=r2 | wc -l' - - @pytest.fixture - def tmp_source(self, tmp_path): - # Creates a temporary directory with a file in it - tmp_dir = tmp_path / 'tmp-source' - tmp_dir.mkdir() - tmp_file = tmp_dir / 'tmp-file' - tmp_file.write_text('test') - circle_link = tmp_dir / 'circle-link' - circle_link.symlink_to(tmp_dir, target_is_directory=True) - yield str(tmp_dir) - - @staticmethod - def generate_bucket_name(): - # Creates a temporary bucket name - # time.time() returns varying precision on different systems, so we - # replace the decimal point and use whatever precision we can get. 
- timestamp = str(time.time()).replace('.', '') - return f'sky-test-{timestamp}' - - @pytest.fixture - def tmp_bucket_name(self): - yield self.generate_bucket_name() - - @staticmethod - def yield_storage_object( - name: Optional[str] = None, - source: Optional[storage_lib.Path] = None, - stores: Optional[Dict[storage_lib.StoreType, - storage_lib.AbstractStore]] = None, - persistent: Optional[bool] = True, - mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT): - # Creates a temporary storage object. Stores must be added in the test. - storage_obj = storage_lib.Storage(name=name, - source=source, - stores=stores, - persistent=persistent, - mode=mode) - yield storage_obj - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_scratch_storage_obj(self, tmp_bucket_name): - # Creates a storage object with no source to create a scratch storage. - # Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name) - - @pytest.fixture - def tmp_multiple_scratch_storage_obj(self): - # Creates a list of 5 storage objects with no source to create - # multiple scratch storages. - # Stores for each object in the list must be added in the test. 
- storage_mult_obj = [] - for _ in range(5): - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}') - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_multiple_custom_source_storage_obj(self): - # Creates a list of storage objects with custom source names to - # create multiple scratch storages. - # Stores for each object in the list must be added in the test. - custom_source_names = ['"path With Spaces"', 'path With Spaces'] - storage_mult_obj = [] - for name in custom_source_names: - src_path = os.path.expanduser(f'~/{name}') - pathlib.Path(src_path).expanduser().mkdir(exist_ok=True) - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}', - source=src_path) - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.fixture - def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temporary storage object. Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmp_source) - - @pytest.fixture - def tmp_local_list_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temp storage object which uses a list of paths as source. - # Stores must be added in the test. 
After upload, the bucket should - # have two files - /tmp-file and /tmp-source/tmp-file - list_source = [tmp_source, tmp_source + '/tmp-file'] - yield from self.yield_storage_object(name=tmp_bucket_name, - source=list_source) - - @pytest.fixture - def tmp_bulk_del_storage_obj(self, tmp_bucket_name): - # Creates a temporary storage object for testing bulk deletion. - # Stores must be added in the test. - with tempfile.TemporaryDirectory() as tmpdir: - subprocess.check_output(f'mkdir -p {tmpdir}/folder{{000..255}}', - shell=True) - subprocess.check_output(f'touch {tmpdir}/test{{000..255}}.txt', - shell=True) - subprocess.check_output( - f'touch {tmpdir}/folder{{000..255}}/test.txt', shell=True) - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmpdir) - - @pytest.fixture - def tmp_copy_mnt_existing_storage_obj(self, tmp_scratch_storage_obj): - # Creates a copy mount storage which reuses an existing storage object. - tmp_scratch_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_scratch_storage_obj.name - - # Try to initialize another storage with the storage object created - # above, but now in COPY mode. This should succeed. - yield from self.yield_storage_object(name=storage_name, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_gitignore_storage_obj(self, tmp_bucket_name, gitignore_structure): - # Creates a temporary storage object for testing .gitignore filter. - # GITIGNORE_STRUCTURE represents a file structure in a dictionary - # format. Created storage object will contain the file structure along - # with .gitignore and .git/info/exclude files to test exclude filter. - # Stores must be added in the test. 
- with tempfile.TemporaryDirectory() as tmpdir: - # Creates file structure to be uploaded in the Storage - self.create_dir_structure(tmpdir, gitignore_structure) - - # Create .gitignore and list files/dirs to be excluded in it - skypilot_path = os.path.dirname(os.path.dirname(sky.__file__)) - temp_path = f'{tmpdir}/.gitignore' - file_path = os.path.join(skypilot_path, 'tests/gitignore_test') - shutil.copyfile(file_path, temp_path) - - # Create .git/info/exclude and list files/dirs to be excluded in it - temp_path = f'{tmpdir}/.git/info/' - os.makedirs(temp_path) - temp_exclude_path = os.path.join(temp_path, 'exclude') - file_path = os.path.join(skypilot_path, - 'tests/git_info_exclude_test') - shutil.copyfile(file_path, temp_exclude_path) - - # Create sky Storage with the files created - yield from self.yield_storage_object( - name=tmp_bucket_name, - source=tmpdir, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_awscli_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call(['aws', 's3', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['aws', 's3', 'rb', bucket_uri, '--force']) - - @pytest.fixture - def tmp_gsutil_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - bucket_uri = f'gs://{tmp_bucket_name}' - subprocess.check_call(['gsutil', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['gsutil', 'rm', '-r', bucket_uri]) - - @pytest.fixture - def tmp_az_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - bucket_uri = data_utils.AZURE_CONTAINER_URL.format( - storage_account_name=storage_account_name, - container_name=tmp_bucket_name) - 
subprocess.check_call([ - 'az', 'storage', 'container', 'create', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call([ - 'az', 'storage', 'container', 'delete', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - - @pytest.fixture - def tmp_awscli_bucket_r2(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - endpoint_url = cloudflare.create_endpoint() - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 mb {bucket_uri} --endpoint {endpoint_url} --profile=r2', - shell=True) - yield tmp_bucket_name, bucket_uri - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {bucket_uri} --force --endpoint {endpoint_url} --profile=r2', - shell=True) - - @pytest.fixture - def tmp_ibm_cos_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using IBM COS API - storage_obj = storage_lib.IBMCosStore(source="", name=tmp_bucket_name) - yield tmp_bucket_name - storage_obj.delete() - - @pytest.fixture - def tmp_public_storage_obj(self, request): - # Initializes a storage object with a public bucket - storage_obj = storage_lib.Storage(source=request.param) - yield storage_obj - # This does not require any deletion logic because it is a public bucket - # and should not get added to global_user_state. 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, - store_type): - # Creates a new bucket with a local source, uploads files to it - # and deletes it. - tmp_local_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name in out.decode('utf-8') - - # Run sky storage delete to delete the storage object - subprocess.check_output( - ['sky', 'storage', 'delete', tmp_local_storage_obj.name, '--yes']) - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.xdist_group('multiple_bucket_deletion') - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm) - ]) - def test_multiple_buckets_creation_and_deletion( - self, tmp_multiple_scratch_storage_obj, store_type): - # Creates multiple new buckets(5 buckets) with a local source - # and deletes them. 
- storage_obj_name = [] - for store_obj in tmp_multiple_scratch_storage_obj: - store_obj.add_store(store_type) - storage_obj_name.append(store_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in out for item in storage_obj_name]) - - # Run sky storage delete all to delete all storage objects - delete_cmd = ['sky', 'storage', 'delete', '--yes'] - delete_cmd += storage_obj_name - subprocess.check_output(delete_cmd) - - # Run sky storage ls to check if all storage objects filtered by store - # type are deleted - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item not in out for item in storage_obj_name]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_upload_source_with_spaces(self, store_type, - tmp_multiple_custom_source_storage_obj): - # Creates two buckets with specified local sources - # with spaces in the name - storage_obj_names = [] - for storage_obj in tmp_multiple_custom_source_storage_obj: - storage_obj.add_store(store_type) - storage_obj_names.append(storage_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in 
out for item in storage_obj_names]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_external_deletion(self, tmp_scratch_storage_obj, - store_type): - # Creates a bucket, deletes it externally using cloud cli commands - # and then tries to delete it using sky storage delete. - tmp_scratch_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name in out.decode('utf-8') - - # Delete bucket externally - cmd = self.cli_delete_cmd(store_type, tmp_scratch_storage_obj.name) - subprocess.check_output(cmd, shell=True) - - # Run sky storage delete to delete the storage object - out = subprocess.check_output( - ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name, '--yes']) - # Make sure bucket was not created during deletion (see issue #1322) - assert 'created' not in out.decode('utf-8').lower() - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_bulk_deletion(self, store_type, tmp_bulk_del_storage_obj): - # Creates a temp folder with over 256 files and folders, upload - # files and folders to a new bucket, then delete bucket. 
- tmp_bulk_del_storage_obj.add_store(store_type) - - subprocess.check_output([ - 'sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name, '--yes' - ]) - - output = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_bulk_del_storage_obj.name not in output.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'tmp_public_storage_obj, store_type', - [('s3://tcga-2-open', storage_lib.StoreType.S3), - ('s3://digitalcorpora', storage_lib.StoreType.S3), - ('gs://gcp-public-data-sentinel-2', storage_lib.StoreType.GCS), - pytest.param( - 'https://azureopendatastorage.blob.core.windows.net/nyctlc', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure)], - indirect=['tmp_public_storage_obj']) - def test_public_bucket(self, tmp_public_storage_obj, store_type): - # Creates a new bucket with a public source and verifies that it is not - # added to global_user_state. - tmp_public_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_public_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'nonexist_bucket_url', - [ - 's3://{random_name}', - 'gs://{random_name}', - pytest.param( - 'https://{account_name}.blob.core.windows.net/{random_name}', # pylint: disable=line-too-long - marks=pytest.mark.azure), - pytest.param('cos://us-east/{random_name}', marks=pytest.mark.ibm), - pytest.param('r2://{random_name}', marks=pytest.mark.cloudflare) - ]) - def test_nonexistent_bucket(self, nonexist_bucket_url): - # Attempts to create fetch a stroage with a non-existent source. 
- # Generate a random bucket name and verify it doesn't exist: - retry_count = 0 - while True: - nonexist_bucket_name = str(uuid.uuid4()) - if nonexist_bucket_url.startswith('s3'): - command = f'aws s3api head-bucket --bucket {nonexist_bucket_name}' - expected_output = '404' - elif nonexist_bucket_url.startswith('gs'): - command = f'gsutil ls {nonexist_bucket_url.format(random_name=nonexist_bucket_name)}' - expected_output = 'BucketNotFoundException' - elif nonexist_bucket_url.startswith('https'): - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - command = f'az storage container exists --account-name {storage_account_name} --account-key {storage_account_key} --name {nonexist_bucket_name}' - expected_output = '"exists": false' - elif nonexist_bucket_url.startswith('r2'): - endpoint_url = cloudflare.create_endpoint() - command = f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api head-bucket --bucket {nonexist_bucket_name} --endpoint {endpoint_url} --profile=r2' - expected_output = '404' - elif nonexist_bucket_url.startswith('cos'): - # Using API calls, since using rclone requires a profile's name - try: - expected_output = command = "echo" # avoid unrelated exception in case of failure. 
- bucket_name = urllib.parse.urlsplit( - nonexist_bucket_url.format( - random_name=nonexist_bucket_name)).path.strip('/') - client = ibm.get_cos_client('us-east') - client.head_bucket(Bucket=bucket_name) - except ibm.ibm_botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == '404': - # success - return - else: - raise ValueError('Unsupported bucket type ' - f'{nonexist_bucket_url}') - - # Check if bucket exists using the cli: - try: - out = subprocess.check_output(command, - stderr=subprocess.STDOUT, - shell=True) - except subprocess.CalledProcessError as e: - out = e.output - out = out.decode('utf-8') - if expected_output in out: - break - else: - retry_count += 1 - if retry_count > 3: - raise RuntimeError('Unable to find a nonexistent bucket ' - 'to use. This is higly unlikely - ' - 'check if the tests are correct.') - - with pytest.raises(sky.exceptions.StorageBucketGetError, - match='Attempted to use a non-existent'): - if nonexist_bucket_url.startswith('https'): - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - account_name=storage_account_name, - random_name=nonexist_bucket_name)) - else: - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - random_name=nonexist_bucket_name)) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'private_bucket', - [ - f's3://imagenet', - f'gs://imagenet', - pytest.param('https://smoketestprivate.blob.core.windows.net/test', - marks=pytest.mark.azure), # pylint: disable=line-too-long - pytest.param('cos://us-east/bucket1', marks=pytest.mark.ibm) - ]) - def test_private_bucket(self, private_bucket): - # Attempts to access private buckets not belonging to the user. - # These buckets are known to be private, but may need to be updated if - # they are removed by their owners. 
- store_type = urllib.parse.urlsplit(private_bucket).scheme - if store_type == 'https' or store_type == 'cos': - private_bucket_name = urllib.parse.urlsplit( - private_bucket).path.strip('/') - else: - private_bucket_name = urllib.parse.urlsplit(private_bucket).netloc - with pytest.raises( - sky.exceptions.StorageBucketGetError, - match=storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( - name=private_bucket_name)): - storage_obj = storage_lib.Storage(source=private_bucket) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_az_bucket', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param('tmp_ibm_cos_bucket', - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_upload_to_existing_bucket(self, ext_bucket_fixture, request, - tmp_source, store_type): - # Tries uploading existing files to newly created bucket (outside of - # sky) and verifies that files are written. 
- bucket_name, _ = request.getfixturevalue(ext_bucket_fixture) - storage_obj = storage_lib.Storage(name=bucket_name, source=tmp_source) - storage_obj.add_store(store_type) - - # Check if tmp_source/tmp-file exists in the bucket using aws cli - out = subprocess.check_output(self.cli_ls_cmd(store_type, bucket_name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check symlinks - symlinks don't get copied by sky storage - assert (pathlib.Path(tmp_source) / 'circle-link').is_symlink(), ( - 'circle-link was not found in the upload source - ' - 'are the test fixtures correct?') - assert 'circle-link' not in out.decode('utf-8'), ( - 'Symlink found in bucket - ls output was : {}'.format( - out.decode('utf-8'))) - - # Run sky storage ls to check if storage object exists in the output. - # It should not exist because the bucket was created externally. - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - def test_copy_mount_existing_storage(self, - tmp_copy_mnt_existing_storage_obj): - # Creates a bucket with no source in MOUNT mode (empty bucket), and - # then tries to load the same storage in COPY mode. - tmp_copy_mnt_existing_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_copy_mnt_existing_storage_obj.name - - # Check `sky storage ls` to ensure storage object exists - out = subprocess.check_output(['sky', 'storage', 'ls']).decode('utf-8') - assert storage_name in out, f'Storage {storage_name} not found in sky storage ls.' 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_list_source(self, tmp_local_list_storage_obj, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. - tmp_local_list_storage_obj.add_store(store_type) - - # Check if tmp-file exists in the bucket root using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check if tmp-file exists in the bucket/tmp-source using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name, 'tmp-source/'), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('invalid_name_list, store_type', - [(AWS_INVALID_NAMES, storage_lib.StoreType.S3), - (GCS_INVALID_NAMES, storage_lib.StoreType.GCS), - pytest.param(AZURE_INVALID_NAMES, - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param(IBM_INVALID_NAMES, - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param(AWS_INVALID_NAMES, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_invalid_names(self, invalid_name_list, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. 
- for name in invalid_name_list: - with pytest.raises(sky.exceptions.StorageNameError): - storage_obj = storage_lib.Storage(name=name) - storage_obj.add_store(store_type) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'gitignore_structure, store_type', - [(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.S3), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.GCS), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.AZURE), - pytest.param(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, - store_type, - tmp_gitignore_storage_obj): - # tests if files included in .gitignore and .git/info/exclude are - # excluded from being transferred to Storage - - tmp_gitignore_storage_obj.add_store(store_type) - - upload_file_name = 'included' - # Count the number of files with the given file name - up_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name=upload_file_name) - git_exclude_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name='.git') - cnt_num_file_cmd = self.cli_count_file_in_bucket( - store_type, tmp_gitignore_storage_obj.name) - - up_output = subprocess.check_output(up_cmd, shell=True) - git_exclude_output = subprocess.check_output(git_exclude_cmd, - shell=True) - cnt_output = subprocess.check_output(cnt_num_file_cmd, shell=True) - - assert '3' in up_output.decode('utf-8'), \ - 'Files to be included are not completely uploaded.' - # 1 is read as .gitignore is uploaded - assert '1' in git_exclude_output.decode('utf-8'), \ - '.git directory should not be uploaded.' - # 4 files include .gitignore, included.log, included.txt, include_dir/included.log - assert '4' in cnt_output.decode('utf-8'), \ - 'Some items listed in .gitignore and .git/info/exclude are not excluded.' 
- - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_externally_created_bucket_mount_without_source( - self, ext_bucket_fixture, request, store_type): - # Non-sky managed buckets(buckets created outside of Skypilot CLI) - # are allowed to be MOUNTed by specifying the URI of the bucket to - # source field only. When it is attempted by specifying the name of - # the bucket only, it should error out. - # - # TODO(doyoung): Add test for IBM COS. Currently, this is blocked - # as rclone used to interact with IBM COS does not support feature to - # create a bucket, and the ibmcloud CLI is not supported in Skypilot. - # Either of the feature is necessary to simulate an external bucket - # creation for IBM COS. - # https://github.com/skypilot-org/skypilot/pull/1966/files#r1253439837 - - ext_bucket_name, ext_bucket_uri = request.getfixturevalue( - ext_bucket_fixture) - # invalid spec - with pytest.raises(sky.exceptions.StorageSpecError) as e: - storage_obj = storage_lib.Storage( - name=ext_bucket_name, mode=storage_lib.StorageMode.MOUNT) - storage_obj.add_store(store_type) - - assert 'Attempted to mount a non-sky managed bucket' in str(e) - - # valid spec - storage_obj = storage_lib.Storage(source=ext_bucket_uri, - mode=storage_lib.StorageMode.MOUNT) - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'ap-northeast-1', 'ap-northeast-2', 'ap-northeast-3', 'ap-south-1', - 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'eu-north-1', - 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 'us-east-1', - 'us-east-2', 'us-west-1', 'us-west-2' - ]) - def test_aws_regions(self, tmp_local_storage_obj, region): - # This tests 
creation and upload to bucket in all AWS s3 regions - # To test full functionality, use test_managed_jobs_storage above. - store_type = storage_lib.StoreType.S3 - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - expected_output_region = region - if region == 'us-east-1': - expected_output_region = 'None' # us-east-1 is the default region - assert expected_output_region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'northamerica-northeast1', 'northamerica-northeast2', 'us-central1', - 'us-east1', 'us-east4', 'us-east5', 'us-south1', 'us-west1', 'us-west2', - 'us-west3', 'us-west4', 'southamerica-east1', 'southamerica-west1', - 'europe-central2', 'europe-north1', 'europe-southwest1', 'europe-west1', - 'europe-west2', 'europe-west3', 'europe-west4', 'europe-west6', - 'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12', - 'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2', - 'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2', - 'asia-southeast2', 'me-central1', 'me-central2', 'me-west1', - 'australia-southeast1', 'australia-southeast2', 'africa-south1' - ]) - def test_gcs_regions(self, tmp_local_storage_obj, region): - # This tests creation and upload to bucket in all GCS regions - # To test full functionality, use 
test_managed_jobs_storage above. - store_type = storage_lib.StoreType.GCS - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - assert region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - -# ---------- Testing YAML Specs ---------- -# Our sky storage requires credentials to check the bucket existance when -# loading a task from the yaml file, so we cannot make it a unit test. -class TestYamlSpecs: - # TODO(zhwu): Add test for `to_yaml_config` for the Storage object. - # We should not use `examples/storage_demo.yaml` here, since it requires - # users to ensure bucket names to not exist and/or be unique. 
- _TEST_YAML_PATHS = [ - 'examples/minimal.yaml', 'examples/managed_job.yaml', - 'examples/using_file_mounts.yaml', 'examples/resnet_app.yaml', - 'examples/multi_hostname.yaml' - ] - - def _is_dict_subset(self, d1, d2): - """Check if d1 is the subset of d2.""" - for k, v in d1.items(): - if k not in d2: - if isinstance(v, list) or isinstance(v, dict): - assert len(v) == 0, (k, v) - else: - assert False, (k, v) - elif isinstance(v, dict): - assert isinstance(d2[k], dict), (k, v, d2) - self._is_dict_subset(v, d2[k]) - elif isinstance(v, str): - if k == 'accelerators': - resources = sky.Resources() - resources._set_accelerators(v, None) - assert resources.accelerators == d2[k], (k, v, d2) - else: - assert v.lower() == d2[k].lower(), (k, v, d2[k]) - else: - assert v == d2[k], (k, v, d2[k]) - - def _check_equivalent(self, yaml_path): - """Check if the yaml is equivalent after load and dump again.""" - origin_task_config = common_utils.read_yaml(yaml_path) - - task = sky.Task.from_yaml(yaml_path) - new_task_config = task.to_yaml_config() - # d1 <= d2 - print(origin_task_config, new_task_config) - self._is_dict_subset(origin_task_config, new_task_config) - - def test_load_dump_yaml_config_equivalent(self): - """Test if the yaml config is equivalent after load and dump again.""" - pathlib.Path('~/datasets').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/tmpfile').expanduser().touch() - pathlib.Path('~/.ssh').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/.ssh/id_rsa.pub').expanduser().touch() - pathlib.Path('~/tmp-workdir').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/Downloads/tpu').expanduser().mkdir(parents=True, - exist_ok=True) - for yaml_path in self._TEST_YAML_PATHS: - self._check_equivalent(yaml_path) - - -# ---------- Testing Multiple Accelerators ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support K80 gpus for now -@pytest.mark.no_paperspace # Paperspace does not support K80 gpus -def test_multiple_accelerators_ordered(): - name 
= _get_cluster_name() - test = Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_ordered_with_default(): - name = _get_cluster_name() - test = Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered(): - name = _get_cluster_name() - test = Test( - 'multiple-accelerators-unordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered_with_default(): - name = _get_cluster_name() - test = Test( - 'multiple-accelerators-unordered-with-default', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -def test_multiple_resources(): - name = _get_cluster_name() - test = Test( - 'multiple-resources', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - run_one_test(test) - - -# ---------- Sky Benchmark ---------- -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -@pytest.mark.no_paperspace # Requires other clouds to be enabled -@pytest.mark.no_kubernetes -@pytest.mark.aws # SkyBenchmark requires S3 access -def test_sky_bench(generic_cloud: str): - name = _get_cluster_name() - test = Test( - 'sky-bench', - [ - f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', - 'sleep 120', - f'sky bench show {name} | grep sky-bench-{name} | grep FINISHED', - ], - f'sky bench down {name} -y; sky bench delete {name} -y', - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_failover(): - """Test if the kubernetes context failover works. - - This test requires two kubernetes clusters: - - kind-skypilot: the local cluster with mock labels for 8 H100 GPUs. 
- - another accessible cluster: with enough CPUs - To start the first cluster, run: - sky local up - # Add mock label for accelerator - kubectl label node --overwrite skypilot-control-plane skypilot.co/accelerator=h100 --context kind-skypilot - # Get the token for the cluster in context kind-skypilot - TOKEN=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.users[0].user.token}\') - # Get the API URL for the cluster in context kind-skypilot - API_URL=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.clusters[0].cluster.server}\') - # Add mock capacity for GPU - curl --header "Content-Type: application/json-patch+json" --header "Authorization: Bearer $TOKEN" --request PATCH --data \'[{"op": "add", "path": "/status/capacity/nvidia.com~1gpu", "value": "8"}]\' "$API_URL/api/v1/nodes/skypilot-control-plane/status" - # Add a new namespace to test the handling of namespaces - kubectl create namespace test-namespace --context kind-skypilot - # Set the namespace to test-namespace - kubectl config set-context kind-skypilot --namespace=test-namespace --context kind-skypilot - """ - # Get context that is not kind-skypilot - contexts = subprocess.check_output('kubectl config get-contexts -o name', - shell=True).decode('utf-8').split('\n') - context = [context for context in contexts if context != 'kind-skypilot'][0] - config = textwrap.dedent(f"""\ - kubernetes: - allowed_contexts: - - kind-skypilot - - {context} - """) - with tempfile.NamedTemporaryFile(delete=True) as f: - f.write(config.encode('utf-8')) - f.flush() - name = _get_cluster_name() - test = Test( - 'kubernetes-context-failover', - [ - # Check if kind-skypilot is provisioned with H100 annotations already - 'NODE_INFO=$(kubectl get nodes -o yaml --context kind-skypilot) && ' - 'echo "$NODE_INFO" | grep nvidia.com/gpu | grep 8 && ' - 'echo "$NODE_INFO" | grep skypilot.co/accelerator | grep h100 || ' - '{ echo "kind-skypilot does not exist ' - 'or does not have mock labels 
for GPUs. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - # Check namespace for kind-skypilot is test-namespace - 'kubectl get namespaces --context kind-skypilot | grep test-namespace || ' - '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - 'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"', - # Get contexts and set current context to the other cluster that is not kind-skypilot - f'kubectl config use-context {context}', - # H100 should not in the current context - '! sky show-gpus --cloud kubernetes | grep H100', - f'sky launch -y -c {name}-1 --cpus 1 echo hi', - f'sky logs {name}-1 --status', - # It should be launched not on kind-skypilot - f'sky status -a {name}-1 | grep "{context}"', - # Test failure for launching H100 on other cluster - f'sky launch -y -c {name}-2 --gpus H100 --cpus 1 --cloud kubernetes --region {context} echo hi && exit 1 || true', - # Test failover - f'sky launch -y -c {name}-3 --gpus H100 --cpus 1 --cloud kubernetes echo hi', - f'sky logs {name}-3 --status', - # Test pods - f'kubectl get pods --context kind-skypilot | grep "{name}-3"', - # It should be launched on kind-skypilot - f'sky status -a {name}-3 | grep "kind-skypilot"', - # Should be 7 free GPUs - f'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep " 7"', - # Remove the line with "kind-skypilot" - f'sed -i "/kind-skypilot/d" {f.name}', - # Should still be able to exec and launch on existing cluster - f'sky exec {name}-3 "echo hi"', - f'sky logs {name}-3 --status', - f'sky status -r {name}-3 | grep UP', - f'sky launch -c {name}-3 --gpus h100 echo hi', - f'sky logs {name}-3 --status', - f'sky status -r {name}-3 | grep UP', - ], - f'sky down -y {name}-1 {name}-3', - env={'SKYPILOT_CONFIG': f.name}, - ) - run_one_test(test) diff --git 
a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 322c19a266e..37b61caa328 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -1,44 +1,22 @@ -import enum import inspect -import json import os -import pathlib -import shlex -import shutil import subprocess import sys import tempfile -import textwrap -import time from typing import Dict, List, NamedTuple, Optional, Tuple -import urllib.parse import uuid import colorama -import jinja2 import pytest import sky -from sky import global_user_state -from sky import jobs from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm from sky.clouds import AWS -from sky.clouds import Azure from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone from sky.jobs.state import ManagedJobStatus -from sky.skylet import constants -from sky.skylet import events from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus from sky.utils import common_utils -from sky.utils import resources_utils from sky.utils import subprocess_utils # To avoid the second smoke test reusing the cluster launched in the first @@ -64,9 +42,9 @@ # Get the job queue, and print it once on its own, then print it again to # use with grep by the caller. -_GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' +GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"' # Wait for a job to be not in RUNNING state. Used to check for RECOVERING. -_JOB_WAIT_NOT_RUNNING = ( +JOB_WAIT_NOT_RUNNING = ( 's=$(sky jobs queue);' 'until ! 
echo "$s" | grep "{job_name}" | grep "RUNNING"; do ' 'sleep 10; s=$(sky jobs queue);' @@ -78,7 +56,7 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( +WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 'start_time=$SECONDS; ' @@ -97,9 +75,9 @@ 'done') -def _get_cmd_wait_until_cluster_status_contains_wildcard( +def get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard: str, cluster_status: str, timeout: int): - wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + wait_cmd = WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', 'awk "/^{cluster_name_awk}/') @@ -110,7 +88,7 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( timeout=timeout) -_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( +WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' 'while true; do ' @@ -124,7 +102,7 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( 'sleep 10; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( +WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. 
'start_time=$SECONDS; ' @@ -149,15 +127,15 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( 'sleep 10; ' 'done') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "') -_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') # Managed job functions -_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( +WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( 'sky queue {cluster_name}', 'sky jobs queue').replace( 'awk "\\$2 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( @@ -166,7 +144,7 @@ def _get_cmd_wait_until_cluster_status_contains_wildcard( # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by # _BUMP_UP_SECONDS before exiting. -_BUMP_UP_SECONDS = 35 +BUMP_UP_SECONDS = 35 DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -191,13 +169,13 @@ def echo(self, message: str): print(message, file=sys.stderr, flush=True) -def _get_timeout(generic_cloud: str, - override_timeout: int = DEFAULT_CMD_TIMEOUT): +def get_timeout(generic_cloud: str, + override_timeout: int = DEFAULT_CMD_TIMEOUT): timeouts = {'fluidstack': 60 * 60} # file_mounts return timeouts.get(generic_cloud, override_timeout) -def _get_cluster_name() -> str: +def get_cluster_name() -> str: """Returns a user-unique cluster name for each test_(). Must be called from each test_(). 
@@ -210,7 +188,7 @@ def _get_cluster_name() -> str: return f'{test_name}-{test_id}' -def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: +def terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: cluster_name = serve.generate_replica_cluster_name(name, replica_id) query_cmd = (f'gcloud compute instances list --filter=' f'"(labels.ray-cluster-name:{cluster_name})" ' @@ -352,7 +330,7 @@ def get_gcp_region_for_quota_failover() -> Optional[str]: return None -_VALIDATE_LAUNCH_OUTPUT = ( +VALIDATE_LAUNCH_OUTPUT = ( # Validate the output of the job submission: # ⚙️ Launching on Kubernetes. # Pod is up. diff --git a/tests/test_smoke.py b/tests/test_smoke.py new file mode 100644 index 00000000000..d1dc2129422 --- /dev/null +++ b/tests/test_smoke.py @@ -0,0 +1,36 @@ +# Smoke tests for SkyPilot +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/test_smoke.py +# +# Terminate failed clusters after test finishes +# > pytest tests/test_smoke.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/test_smoke.py::test_minimal +# +# Only run managed job tests +# > pytest tests/test_smoke.py --managed-jobs +# +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# +# Only run test for AWS + generic tests +# > pytest tests/test_smoke.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/test_smoke.py --generic-cloud aws + +# All files categorized under tests/smoke_tests/* +# Please add new test cases under that directory. 
+from smoke_tests.test_basic import * +from smoke_tests.test_cluster_job import * +from smoke_tests.test_images import * +from smoke_tests.test_managed_job import * +from smoke_tests.test_mount_and_storage import * +from smoke_tests.test_region_and_zone import * +from smoke_tests.test_sky_serve import * diff --git a/tests/test_yamls/minimal_test_required_before_merge.yaml b/tests/test_yamls/minimal_test_required_before_merge.yaml new file mode 100644 index 00000000000..aceb5a76cb0 --- /dev/null +++ b/tests/test_yamls/minimal_test_required_before_merge.yaml @@ -0,0 +1,13 @@ +resources: + cloud: aws + instance_type: t3.small + +file_mounts: + ~/aws: . + +workdir: . + +num_nodes: 1 + +run: | + ls -l ~/aws/tests/test_yamls/minimal_test_required_before_merge.yaml From e11a7d123328db6dc486a6323cde2968b00f4380 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 22 Nov 2024 15:40:34 +0800 Subject: [PATCH 26/64] remove unsupport cloud for now --- .buildkite/generate_pipeline.py | 16 +++- .buildkite/pipeline_smoke_test_basic.yaml | 9 -- .../pipeline_smoke_test_cluster_job.yaml | 87 ------------------- .../pipeline_smoke_test_managed_job.yaml | 33 ------- ...pipeline_smoke_test_mount_and_storage.yaml | 25 ------ .../pipeline_smoke_test_region_and_zone.yaml | 8 -- .buildkite/pipeline_smoke_test_sky_serve.yaml | 73 ---------------- 7 files changed, 15 insertions(+), 236 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index b363c695057..cb135b41a61 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -7,6 +7,10 @@ import yaml DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] +# We only have credentials for aws, azure, and gcp. +# For those test cases that run on other clouds, +# we currently ignore them. 
+ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] def _get_full_decorator_path(decorator: ast.AST) -> str: @@ -59,6 +63,16 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: cloud for cloud in clouds_to_include if cloud not in clouds_to_exclude ] + final_clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud in ALL_CLOUDS_WITH_CREDENTIALS + ] + if clouds_to_include and not final_clouds_to_include: + print(f'Warning: {file_path}:{node.name} ' + f'is marked to run on {clouds_to_include}, ' + f'but we do not have credentials for those clouds. ' + f'Skipped.') + continue function_name = (f'{class_name}::{node.name}' if class_name else node.name) function_cloud_map[function_name] = (clouds_to_include) @@ -100,7 +114,7 @@ def main(): '.buildkite/generate_pipeline.py, Please do not ' 'edit directly.\n') yaml.dump(pipeline, file, default_flow_style=False) - print(f'Convert {test_file_path} to {yaml_file_path}') + print(f'Convert {test_file_path} to {yaml_file_path}\n\n') if __name__ == '__main__': diff --git a/.buildkite/pipeline_smoke_test_basic.yaml b/.buildkite/pipeline_smoke_test_basic.yaml index 9c775c1f5fb..d0ba641c48c 100644 --- a/.buildkite/pipeline_smoke_test_basic.yaml +++ b/.buildkite/pipeline_smoke_test_basic.yaml @@ -39,10 +39,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_cli_logs on aws -- command: pytest tests/smoke_tests/test_basic.py::test_scp_logs --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_logs on scp - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp env: LOG_TO_STDOUT: '1' @@ -79,11 +75,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_sky_bench on aws -- command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_context_failover on kubernetes - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent --aws env: diff --git 
a/.buildkite/pipeline_smoke_test_cluster_job.yaml b/.buildkite/pipeline_smoke_test_cluster_job.yaml index 3b81274a00a..8a813119eb2 100644 --- a/.buildkite/pipeline_smoke_test_cluster_job.yaml +++ b/.buildkite/pipeline_smoke_test_cluster_job.yaml @@ -9,18 +9,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_job_queue_with_docker on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --lambda_cloud - env: - LOG_TO_STDOUT: '1' - label: test_lambda_job_queue on lambda_cloud -- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_job_queue on ibm -- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_job_queue --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_job_queue on scp - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws env: @@ -35,11 +23,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_fast_large_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_ibm_job_queue_multinode - --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_job_queue_multinode on ibm - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws env: @@ -53,14 +36,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_huggingface on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --lambda_cloud - env: - LOG_TO_STDOUT: '1' - label: test_lambda_huggingface on lambda_cloud -- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_huggingface --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_huggingface on scp - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws env: LOG_TO_STDOUT: '1' @@ -77,10 +52,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_tpu_vm_pod on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_tpu_pod_slice_gke on 
kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws env: LOG_TO_STDOUT: '1' @@ -104,21 +75,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_azure_http_server_with_custom_ports on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_http_server_with_custom_ports on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_paperspace_http_server_with_custom_ports - --paperspace - env: - LOG_TO_STDOUT: '1' - label: test_paperspace_http_server_with_custom_ports on paperspace -- command: pytest tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports - --runpod - env: - LOG_TO_STDOUT: '1' - label: test_runpod_http_server_with_custom_ports on runpod - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws env: LOG_TO_STDOUT: '1' @@ -127,36 +83,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_task_labels_gcp on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_task_labels_kubernetes on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_container_logs_multinode_kubernetes on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes 
- env: - LOG_TO_STDOUT: '1' - label: test_container_logs_two_jobs_kubernetes on kubernetes -- command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws env: LOG_TO_STDOUT: '1' @@ -177,10 +103,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_autodown on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_scp_autodown --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_autodown on scp - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws env: LOG_TO_STDOUT: '1' @@ -197,10 +119,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_cancel_pytorch on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_ibm --ibm - env: - LOG_TO_STDOUT: '1' - label: test_cancel_ibm on ibm - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws env: LOG_TO_STDOUT: '1' @@ -221,11 +139,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_aws_custom_image on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_custom_image on kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes --azure env: diff --git a/.buildkite/pipeline_smoke_test_managed_job.yaml b/.buildkite/pipeline_smoke_test_managed_job.yaml index cda2b87a53c..fee2ae1f3c8 100644 --- a/.buildkite/pipeline_smoke_test_managed_job.yaml +++ b/.buildkite/pipeline_smoke_test_managed_job.yaml @@ -1,23 +1,5 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
steps: -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs on managed_jobs -- command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_job_pipeline on managed_jobs -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_failed_setup on managed_jobs -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_pipeline_failed_setup on managed_jobs - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws --aws env: @@ -38,11 +20,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_managed_jobs_pipeline_recovery_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_default_resources on managed_jobs - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws env: @@ -63,17 +40,7 @@ steps: env: LOG_TO_STDOUT: '1' label: test_managed_jobs_cancellation_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_storage on managed_jobs - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp env: LOG_TO_STDOUT: '1' label: test_managed_jobs_tpu on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --managed_jobs - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_inline_env on managed_jobs diff --git a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml index 
6f1d11e7804..01f8739dd79 100644 --- a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml +++ b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml @@ -4,11 +4,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_file_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_scp_file_mounts - --scp - env: - LOG_TO_STDOUT: '1' - label: test_scp_file_mounts on scp - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars --aws env: @@ -29,31 +24,11 @@ steps: env: LOG_TO_STDOUT: '1' label: test_azure_storage_mounts_with_stop on azure -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_storage_mounts on kubernetes -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_kubernetes_context_switch on kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts --aws env: LOG_TO_STDOUT: '1' label: test_docker_storage_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_cloudflare_storage_mounts - --cloudflare - env: - LOG_TO_STDOUT: '1' - label: test_cloudflare_storage_mounts on cloudflare -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_ibm_storage_mounts - --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_storage_mounts on ibm - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion --aws env: diff --git a/.buildkite/pipeline_smoke_test_region_and_zone.yaml b/.buildkite/pipeline_smoke_test_region_and_zone.yaml index ae38eb4b594..aa955bc1864 100644 --- a/.buildkite/pipeline_smoke_test_region_and_zone.yaml +++ b/.buildkite/pipeline_smoke_test_region_and_zone.yaml @@ -14,10 +14,6 @@ steps: env: LOG_TO_STDOUT: '1' label: 
test_gcp_region_and_service_account on gcp -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_region --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_region on ibm - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure env: LOG_TO_STDOUT: '1' @@ -26,10 +22,6 @@ steps: env: LOG_TO_STDOUT: '1' label: test_aws_zone on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_ibm_zone --ibm - env: - LOG_TO_STDOUT: '1' - label: test_ibm_zone on ibm - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp env: LOG_TO_STDOUT: '1' diff --git a/.buildkite/pipeline_smoke_test_sky_serve.yaml b/.buildkite/pipeline_smoke_test_sky_serve.yaml index 0fd84641780..4cd4d35aa4d 100644 --- a/.buildkite/pipeline_smoke_test_sky_serve.yaml +++ b/.buildkite/pipeline_smoke_test_sky_serve.yaml @@ -12,94 +12,21 @@ steps: env: LOG_TO_STDOUT: '1' label: test_skyserve_azure_http on azure -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_kubernetes_http on kubernetes -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_oci_http --oci - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_oci_http on oci -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_llm on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery --gcp env: LOG_TO_STDOUT: '1' label: test_skyserve_spot_recovery on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_base_ondemand_fallback on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback --gcp env: LOG_TO_STDOUT: '1' label: test_skyserve_dynamic_ondemand_fallback on gcp -- command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_user_bug_restart on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_load_balancer on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart --gcp env: LOG_TO_STDOUT: '1' label: test_skyserve_auto_restart on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_cancel on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_streaming on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_readiness_timeout_fail on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_large_readiness_timeout on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_update on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_rolling_update on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_fast_update on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_update_autoscale on serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_new_autoscaler_update on 
serve -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --serve - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_failures on serve - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws env: LOG_TO_STDOUT: '1' From 8a651508b960343ec147d0be548977cd589b2edf Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 12:05:32 +0800 Subject: [PATCH 27/64] merge branch 'reliable_smoke_test_more' --- tests/smoke_tests/test_basic.py | 29 ++-- tests/smoke_tests/test_cluster_job.py | 48 +++--- tests/smoke_tests/test_images.py | 14 +- tests/smoke_tests/test_managed_job.py | 153 +++++++++--------- tests/smoke_tests/test_region_and_zone.py | 12 +- .../smoke_tests/test_required_before_merge.py | 7 +- tests/smoke_tests/util.py | 83 ++++++++-- 7 files changed, 209 insertions(+), 137 deletions(-) diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index 0090ae957b8..1f76254b67d 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -27,13 +27,14 @@ import pytest from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains +from smoke_tests.util import ( + get_cmd_wait_until_job_status_contains_without_matching_job) from smoke_tests.util import get_timeout from smoke_tests.util import run_one_test from smoke_tests.util import SCP_TYPE from smoke_tests.util import Test from smoke_tests.util import VALIDATE_LAUNCH_OUTPUT -from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB import sky from sky.skylet import events @@ -142,9 +143,9 @@ def test_launch_fast_with_autostop(generic_cloud: str): f'sky status -r {name} | grep UP', # Ensure cluster is stopped - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + 
cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Launch again. Do full output validation - we expect the cluster to re-launch @@ -170,9 +171,9 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', @@ -201,17 +202,17 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, + job_status=[JobStatus.FAILED_DRIVER], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', @@ -242,9 +243,9 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED_DRIVER.value, + job_status=[JobStatus.FAILED_DRIVER], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', @@ -354,9 +355,9 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED, + cluster_status=[ClusterStatus.STOPPED], timeout=120) # Run it again - should work with fast=True sky.launch(task, diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 22b6d9dc8f0..5fce0c2208c 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -28,6 +28,9 @@ from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_aws_region_for_quota_failover from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains +from smoke_tests.util import ( + get_cmd_wait_until_job_status_contains_matching_job_id) from smoke_tests.util import get_gcp_region_for_quota_failover from smoke_tests.util import get_timeout from smoke_tests.util import LAMBDA_TYPE @@ -35,8 +38,6 @@ from smoke_tests.util import SCP_GPU_V100 from smoke_tests.util import SCP_TYPE from smoke_tests.util import Test -from smoke_tests.util import WAIT_UNTIL_CLUSTER_STATUS_CONTAINS -from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID import sky from sky import AWS @@ -419,10 +420,10 @@ def test_multi_echo(generic_cloud: str): ] + # Ensure jobs succeeded. 
[ - WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=i + 1, - job_status=JobStatus.SUCCEEDED.value, + job_status=[JobStatus.SUCCEEDED], timeout=120) for i in range(32) ] + # Ensure monitor/autoscaler didn't crash on the 'assert not @@ -996,17 +997,16 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=200), ], f'sky down -y {name}', @@ -1030,10 +1030,9 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=280) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], @@ -1071,9 +1070,9 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. 
- WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). @@ -1090,9 +1089,9 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Test restarting the idleness timer via exec: @@ -1102,9 +1101,9 @@ def test_autostop(generic_cloud: str): 'sleep 45', # Almost reached the threshold. f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout + BUMP_UP_SECONDS), ], f'sky down -y {name}', @@ -1322,18 +1321,18 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=120), ], f'sky down -y {name}', @@ 
-1439,10 +1438,9 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. - WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.INIT.value}|{ClusterStatus.STOPPED.value})', + cluster_status=[ClusterStatus.INIT, ClusterStatus.STOPPED], timeout=200 + BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index e2e4c440b89..b66211d016d 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -20,9 +20,9 @@ # > pytest tests/smoke_tests/test_images.py --generic-cloud aws import pytest -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND -from smoke_tests.util import _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS from smoke_tests.util import get_cluster_name +from smoke_tests.util import get_cmd_wait_until_cluster_is_not_found +from smoke_tests.util import get_cmd_wait_until_cluster_status_contains from smoke_tests.util import run_one_test from smoke_tests.util import Test @@ -279,9 +279,9 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=60), # Wait for EC2 instance to be in stopped state. # TODO: event based wait. @@ -331,7 +331,7 @@ def test_gcp_mig(): # Check MIG exists. 
f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + get_cmd_wait_until_cluster_is_not_found(cluster_name=name, timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template @@ -399,9 +399,9 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 521b08797f5..5f3e3b2117c 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -29,14 +29,14 @@ import pytest from smoke_tests.util import _BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name +from smoke_tests.util import ( + get_cmd_wait_until_managed_job_status_contains_matching_job_name) from smoke_tests.util import GET_JOB_QUEUE from smoke_tests.util import JOB_WAIT_NOT_RUNNING from smoke_tests.util import run_one_test from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test from smoke_tests.util import TestStorageWithCredentials -from smoke_tests.util import ( - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) from sky import jobs from sky.data import storage as storage_lib @@ -58,20 +58,24 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} 
examples/managed_job.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.INIT, + ManagedJobStatus.RUNNING + ], timeout=60), - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status= - f'({ManagedJobStatus.PENDING.value}|{ManagedJobStatus.INIT.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.INIT, + ManagedJobStatus.RUNNING + ], timeout=60), f'sky jobs cancel -y -n {name}-1', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status=f'{ManagedJobStatus.CANCELLED.value}', + job_status=[ManagedJobStatus.CANCELLED], timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', @@ -169,9 +173,9 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', + job_status=[ManagedJobStatus.FAILED_SETUP], timeout=600), # Make sure the job failed quickly. 
f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', @@ -206,9 +210,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. @@ -219,9 +223,9 @@ def test_managed_jobs_recovery_aws(aws_config_region): '--output text)'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], @@ -250,18 +254,18 @@ def test_managed_jobs_recovery_gcp(): 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], @@ -285,9 +289,9 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -307,9 +311,9 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): '--output text)'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -340,9 +344,9 @@ def 
test_managed_jobs_pipeline_recovery_gcp(): 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -354,9 +358,9 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -383,10 +387,11 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str): 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.RECOVERING.value})', + job_status=[ + ManagedJobStatus.RUNNING, 
ManagedJobStatus.RECOVERING + ], timeout=360), ], f'sky jobs cancel -y -n {name}', @@ -407,9 +412,9 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. @@ -421,9 +426,9 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): '--output text)'), JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], @@ -452,18 +457,18 @@ def test_managed_jobs_recovery_multi_node_gcp(): 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee 
/tmp/{name}-run-id', # Terminate the worker manually. terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], @@ -489,15 +494,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.STARTING.value}|{ManagedJobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + ], timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' @@ -507,14 +513,14 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. 
- WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' @@ -524,9 +530,9 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', # The job is running in the cluster, will shown as RUNNING. - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -537,9 +543,9 @@ def test_managed_jobs_cancellation_aws(aws_config_region): JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. 
We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. @@ -575,41 +581,41 @@ def test_managed_jobs_cancellation_gcp(): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.STARTING.value, + job_status=[ManagedJobStatus.STARTING], timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.RUNNING.value, + job_status=[ManagedJobStatus.RUNNING], timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=ManagedJobStatus.CANCELLED.value, + job_status=[ManagedJobStatus.CANCELLED], timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
@@ -700,9 +706,9 @@ def test_managed_jobs_storage(generic_cloud: str): *STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, + job_status=[ManagedJobStatus.SUCCEEDED], timeout=60 + _BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket @@ -727,15 +733,16 @@ def test_managed_jobs_tpu(): 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.STARTING.value, + job_status=[ManagedJobStatus.STARTING], timeout=60 + _BUMP_UP_SECONDS), # TPU takes a while to launch - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.SUCCEEDED.value})', + job_status=[ + ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED + ], timeout=900 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -754,9 +761,9 @@ def test_managed_jobs_inline_env(generic_cloud: str): 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=ManagedJobStatus.SUCCEEDED.value, + job_status=[ManagedJobStatus.SUCCEEDED], timeout=20 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index 3000c82068d..bbfe3874315 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -25,10 +25,10 @@ import pytest from smoke_tests.util import get_cluster_name from smoke_tests.util import get_cmd_wait_until_cluster_status_contains_wildcard +from smoke_tests.util import ( + get_cmd_wait_until_managed_job_status_contains_matching_job_name) from smoke_tests.util import run_one_test from smoke_tests.util import Test -from smoke_tests.util import ( - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME) from sky.jobs.state import ManagedJobStatus from sky.skylet import constants @@ -87,10 +87,12 @@ def test_aws_with_ssh_proxy_command(): cluster_status=ClusterStatus.UP.value, timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status= - f'({ManagedJobStatus.SUCCEEDED.value}|{ManagedJobStatus.RUNNING.value}|{ManagedJobStatus.STARTING.value})', + job_status=[ + ManagedJobStatus.SUCCEEDED, ManagedJobStatus.RUNNING, + ManagedJobStatus.STARTING + ], timeout=300), ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_required_before_merge.py index dd368718821..677db104549 100644 --- a/tests/smoke_tests/test_required_before_merge.py +++ 
b/tests/smoke_tests/test_required_before_merge.py @@ -20,9 +20,10 @@ # > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws from smoke_tests.util import get_cluster_name +from smoke_tests.util import ( + get_cmd_wait_until_job_status_contains_matching_job_id) from smoke_tests.util import run_one_test from smoke_tests.util import Test -from smoke_tests.util import WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID from sky.skylet import events from sky.skylet.job_lib import JobStatus @@ -34,10 +35,10 @@ def test_yaml_launch_and_mount(generic_cloud: str): 'test_yaml_launch_and_mount', [ f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', - WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=1, - job_status=JobStatus.SUCCEEDED.value, + job_status=[JobStatus.SUCCEEDED], timeout=2 * 60), ], f'sky down -y {name}', diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 37b61caa328..0e5c4dd2d8d 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -1,3 +1,4 @@ +import enum import inspect import os import subprocess @@ -56,7 +57,16 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( + +def _statuses_to_str(statuses: List[enum.Enum]): + """Convert a list of enums to a string with all the values separated by |.""" + if len(statuses) > 1: + return '(' + '|'.join([status.value for status in statuses]) + ')' + else: + return statuses[0].value + + +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 
'start_time=$SECONDS; ' @@ -75,20 +85,29 @@ 'done') +def get_cmd_wait_until_cluster_status_contains( + cluster_name: str, cluster_status: List[ClusterStatus], timeout: int): + return _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=cluster_name, + cluster_status=_statuses_to_str(cluster_status), + timeout=timeout) + + def get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard: str, cluster_status: str, timeout: int): - wait_cmd = WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + cluster_name_wildcard: str, cluster_status: List[ClusterStatus], + timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( 'sky status {cluster_name}', 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', 'awk "/^{cluster_name_awk}/') return wait_cmd.format(cluster_name=cluster_name_wildcard, cluster_name_awk=cluster_name_wildcard.replace( '*', '.*'), - cluster_status=cluster_status, + cluster_status=_statuses_to_str(cluster_status), timeout=timeout) -WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' 'while true; do ' @@ -98,11 +117,17 @@ def get_cmd_wait_until_cluster_status_contains_wildcard( 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' ' echo "Cluster {cluster_name} successfully removed."; break; ' 'fi; ' - 'echo "Waiting for cluster {name} to be removed..."; ' + 'echo "Waiting for cluster {cluster_name} to be removed..."; ' 'sleep 10; ' 'done') -WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( + +def get_cmd_wait_until_cluster_is_not_found(cluster_name: str, timeout: int): + return _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=cluster_name, + timeout=timeout) + + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. 
'start_time=$SECONDS; ' @@ -127,20 +152,58 @@ def get_cmd_wait_until_cluster_status_contains_wildcard( 'sleep 10; ' 'done') -WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "') -WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') + +def get_cmd_wait_until_job_status_contains_matching_job_id( + cluster_name: str, job_id: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=cluster_name, + job_id=job_id, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def get_cmd_wait_until_job_status_contains_without_matching_job( + cluster_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=cluster_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def get_cmd_wait_until_job_status_contains_matching_job_name( + cluster_name: str, job_name: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + cluster_name=cluster_name, + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + # Managed job functions -WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( 'sky queue {cluster_name}', 'sky jobs queue').replace( 'awk "\\$2 == \\"{job_name}\\"', 'awk "\\$2 == \\"{job_name}\\" || 
\\$3 == \\"{job_name}\\"').replace( _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) + +def get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + # After the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by # _BUMP_UP_SECONDS before exiting. From 41bac9bc9eb838dfbf4bd14d56ac15cc7fca4be1 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 12:13:52 +0800 Subject: [PATCH 28/64] bug fix --- tests/smoke_tests/test_managed_job.py | 40 +++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 5f3e3b2117c..f41e0a6c2ca 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -27,7 +27,7 @@ import time import pytest -from smoke_tests.util import _BUMP_UP_SECONDS +from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name from smoke_tests.util import ( get_cmd_wait_until_managed_job_status_contains_matching_job_name) @@ -147,10 +147,10 @@ def test_managed_jobs_failed_setup(generic_cloud: str): [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', # Make sure the job failed quickly. - WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=f'{ManagedJobStatus.FAILED_SETUP.value}', - timeout=330 + _BUMP_UP_SECONDS), + job_status=[ManagedJobStatus.FAILED_SETUP], + timeout=330 + BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
@@ -499,12 +499,12 @@ def test_managed_jobs_cancellation_aws(aws_config_region): job_status=[ ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -516,12 +516,12 @@ def test_managed_jobs_cancellation_aws(aws_config_region): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -533,7 +533,7 @@ def test_managed_jobs_cancellation_aws(aws_config_region): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' @@ -546,7 +546,7 @@ def test_managed_jobs_cancellation_aws(aws_config_region): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$(aws ec2 describe-instances --region {region} ' @@ -584,30 +584,30 @@ def test_managed_jobs_cancellation_gcp(): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.STARTING], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.RUNNING], - timeout=300 + _BUMP_UP_SECONDS), + timeout=300 + BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), @@ -616,7 +616,7 @@ def test_managed_jobs_cancellation_gcp(): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + _BUMP_UP_SECONDS), + timeout=120 + BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' @@ -709,7 +709,7 @@ def test_managed_jobs_storage(generic_cloud: str): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd @@ -736,14 +736,14 @@ def test_managed_jobs_tpu(): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.STARTING], - timeout=60 + _BUMP_UP_SECONDS), + timeout=60 + BUMP_UP_SECONDS), # TPU takes a while to launch get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED ], - timeout=900 + _BUMP_UP_SECONDS), + timeout=900 + BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since 
sky jobs queue -r can be blocked by other spot tests. @@ -764,7 +764,7 @@ def test_managed_jobs_inline_env(generic_cloud: str): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], - timeout=20 + _BUMP_UP_SECONDS), + timeout=20 + BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From fd46f09a135c1c8e2bce702b66493b4c53279186 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 13:05:59 +0800 Subject: [PATCH 29/64] bug fix --- tests/smoke_tests/test_managed_job.py | 2 +- tests/smoke_tests/test_mount_and_storage.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index f41e0a6c2ca..44ab29705ea 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -27,6 +27,7 @@ import time import pytest +from smoke_tests.test_mount_and_storage import TestStorageWithCredentials from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name from smoke_tests.util import ( @@ -36,7 +37,6 @@ from smoke_tests.util import run_one_test from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -from smoke_tests.util import TestStorageWithCredentials from sky import jobs from sky.data import storage as storage_lib diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py index 95952d3b432..6a2f0944fec 100644 --- a/tests/smoke_tests/test_mount_and_storage.py +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -38,7 +38,6 @@ from smoke_tests.util import SCP_TYPE from smoke_tests.util import STORAGE_SETUP_COMMANDS from smoke_tests.util import Test -from smoke_tests.util import TestStorageWithCredentials import sky from sky import global_user_state From dc71b72a9990d7709f409cc1f425f1e77341001e Mon 
Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 13:17:35 +0800 Subject: [PATCH 30/64] bug fix --- tests/smoke_tests/test_basic.py | 5 ++++- tests/smoke_tests/test_managed_job.py | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index 1f76254b67d..8239b25db35 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -26,6 +26,7 @@ import time import pytest +from smoke_tests.util import BUMP_UP_SECONDS from smoke_tests.util import get_cluster_name from smoke_tests.util import get_cmd_wait_until_cluster_status_contains from smoke_tests.util import ( @@ -147,7 +148,9 @@ def test_launch_fast_with_autostop(generic_cloud: str): cluster_name=name, cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), - + # Even the cluster is stopped, cloud platform may take a while to + # delete the VM. + f'sleep {BUMP_UP_SECONDS}', # Launch again. Do full output validation - we expect the cluster to re-launch f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 44ab29705ea..e8d13c21354 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -61,15 +61,15 @@ def test_managed_jobs(generic_cloud: str): get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.INIT, - ManagedJobStatus.RUNNING + ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], timeout=60), get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.INIT, - ManagedJobStatus.RUNNING + ManagedJobStatus.PENDING, 
ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING ], timeout=60), f'sky jobs cancel -y -n {name}-1', From e68430be2e17cf950f0bbdae9c9eff8e49a5be89 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 15:17:12 +0800 Subject: [PATCH 31/64] test pipeline pre merge --- .buildkite/pipeline_pre_merge.yaml | 11 +++++++++++ .buildkite/trigger_build.sh | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 .buildkite/pipeline_pre_merge.yaml create mode 100644 .buildkite/trigger_build.sh diff --git a/.buildkite/pipeline_pre_merge.yaml b/.buildkite/pipeline_pre_merge.yaml new file mode 100644 index 00000000000..4edeb3328fd --- /dev/null +++ b/.buildkite/pipeline_pre_merge.yaml @@ -0,0 +1,11 @@ +steps: + - label: "Validation check" + command: "./buildkite/trigger_build.sh pre-merge-test" + key: "validation-check" + - label: "Run pre merge tests" + command: | + if [ $$(buildkite-agent step get "outcome" --step "validation-check") == "passed" ]; then + buildkite-agent pipeline upload .buildkite/pipeline_smoke_test_required_before_merge.yaml + else + echo "Didn't pass validation, nothing to run" + fi diff --git a/.buildkite/trigger_build.sh b/.buildkite/trigger_build.sh new file mode 100644 index 00000000000..f40c64f60c8 --- /dev/null +++ b/.buildkite/trigger_build.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Parse the webhook payload (read from stdin) +PAYLOAD=$(cat) + +# Define the allowed user(s) +ALLOWED_USERS=("zpoint" "Michaelvll" "concretevitamin" "romilbhardwaj" "cg505" "yika-luo") # GitHub usernames + +# Extract comment body and user info +COMMENT_BODY=$(echo "$PAYLOAD" | jq -r '.comment.body') +COMMENT_USER=$(echo "$PAYLOAD" | jq -r '.comment.user.login') + +# Read the keyword from the first argument +KEYWORD="$1" + +# Check if the comment contains the keyword and the user is authorized +if [[ "$COMMENT_BODY" == *"$KEYWORD"* ]] && + ( [[ " ${ALLOWED_USERS[@]} " =~ " $COMMENT_USER " ]]); then + echo 
"Triggering build because $KEYWORD was mentioned by authorized user: $COMMENT_USER" + exit 0 # Exit with success to continue the build +else + echo "Build not triggered. Either $KEYWORD not found or user not authorized." + exit 1 # Exit with failure to stop the build +fi From d2ab7baf92e2960b61abd3e513d15f42b0d2f389 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 15:24:56 +0800 Subject: [PATCH 32/64] build test --- .buildkite/trigger_build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/trigger_build.sh b/.buildkite/trigger_build.sh index f40c64f60c8..ff57b506c9d 100644 --- a/.buildkite/trigger_build.sh +++ b/.buildkite/trigger_build.sh @@ -3,6 +3,9 @@ # Parse the webhook payload (read from stdin) PAYLOAD=$(cat) +echo "PAYLOAD: $PAYLOAD" +echo "KEYWORD: $1" + # Define the allowed user(s) ALLOWED_USERS=("zpoint" "Michaelvll" "concretevitamin" "romilbhardwaj" "cg505" "yika-luo") # GitHub usernames From d2a065e1187990cae878ead7c18d3b2e26b86edc Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 15:52:58 +0800 Subject: [PATCH 33/64] test again --- .buildkite/generate_pipeline.py | 121 ----------------------------- .buildkite/pipeline_pre_merge.yaml | 11 --- .buildkite/trigger_build.sh | 27 ------- 3 files changed, 159 deletions(-) delete mode 100644 .buildkite/generate_pipeline.py delete mode 100644 .buildkite/pipeline_pre_merge.yaml delete mode 100644 .buildkite/trigger_build.sh diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py deleted file mode 100644 index cb135b41a61..00000000000 --- a/.buildkite/generate_pipeline.py +++ /dev/null @@ -1,121 +0,0 @@ -"""This script generates a Buildkite pipeline from test files.""" -import ast -import copy -import os -from typing import Any, Dict, List - -import yaml - -DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] -# We only have credentials for aws, azure, and gcp. -# For those test cases that run on other clouds, -# we currently ignore them. 
-ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] - - -def _get_full_decorator_path(decorator: ast.AST) -> str: - """Recursively get the full path of a decorator.""" - if isinstance(decorator, ast.Attribute): - return f'{_get_full_decorator_path(decorator.value)}.{decorator.attr}' - elif isinstance(decorator, ast.Name): - return decorator.id - elif isinstance(decorator, ast.Call): - return _get_full_decorator_path(decorator.func) - raise ValueError(f'Unknown decorator type: {type(decorator)}') - - -def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: - """Extract test functions and filter clouds with pytest.mark - from a Python test file.""" - with open(file_path, 'r', encoding='utf-8') as file: - tree = ast.parse(file.read(), filename=file_path) - - for node in ast.walk(tree): - for child in ast.iter_child_nodes(node): - setattr(child, 'parent', node) - - function_cloud_map = {} - for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'): - class_name = None - if hasattr(node, 'parent') and isinstance(node.parent, - ast.ClassDef): - class_name = node.parent.name - - clouds_to_include = [] - clouds_to_exclude = [] - for decorator in node.decorator_list: - if isinstance(decorator, ast.Call): - # We only need to consider the decorator with no arguments - # to extract clouds. 
- continue - full_path = _get_full_decorator_path(decorator) - if full_path.startswith('pytest.mark.'): - assert isinstance(decorator, ast.Attribute) - suffix = decorator.attr - if suffix.startswith('no_'): - clouds_to_exclude.append(suffix[3:]) - else: - clouds_to_include.append(suffix) - clouds_to_include = (clouds_to_include if clouds_to_include else - copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) - clouds_to_include = [ - cloud for cloud in clouds_to_include - if cloud not in clouds_to_exclude - ] - final_clouds_to_include = [ - cloud for cloud in clouds_to_include - if cloud in ALL_CLOUDS_WITH_CREDENTIALS - ] - if clouds_to_include and not final_clouds_to_include: - print(f'Warning: {file_path}:{node.name} ' - f'is marked to run on {clouds_to_include}, ' - f'but we do not have credentials for those clouds. ' - f'Skipped.') - continue - function_name = (f'{class_name}::{node.name}' - if class_name else node.name) - function_cloud_map[function_name] = (clouds_to_include) - return function_cloud_map - - -def _generate_pipeline(test_file: str) -> Dict[str, Any]: - """Generate a Buildkite pipeline from test files.""" - steps = [] - function_cloud_map = _extract_marked_tests(test_file) - for test_function, clouds in function_cloud_map.items(): - for cloud in clouds: - step = { - 'label': f'{test_function} on {cloud}', - 'command': f'pytest {test_file}::{test_function} --{cloud}', - 'env': { - 'LOG_TO_STDOUT': '1' - } - } - steps.append(step) - # we only run one cloud per test function for now - break - return {'steps': steps} - - -def main(): - # List of test files to include in the pipeline - test_files = os.listdir('tests/smoke_tests') - - for test_file in test_files: - if not test_file.startswith('test_'): - continue - test_file_path = os.path.join('tests/smoke_tests', test_file) - pipeline = _generate_pipeline(test_file_path) - yaml_file_path = '.buildkite/pipeline_smoke_' + \ - f'{test_file.split(".")[0]}.yaml' - with open(yaml_file_path, 'w', encoding='utf-8') as 
file: - file.write('# This is an auto-generated Buildkite pipeline by ' - '.buildkite/generate_pipeline.py, Please do not ' - 'edit directly.\n') - yaml.dump(pipeline, file, default_flow_style=False) - print(f'Convert {test_file_path} to {yaml_file_path}\n\n') - - -if __name__ == '__main__': - main() diff --git a/.buildkite/pipeline_pre_merge.yaml b/.buildkite/pipeline_pre_merge.yaml deleted file mode 100644 index 4edeb3328fd..00000000000 --- a/.buildkite/pipeline_pre_merge.yaml +++ /dev/null @@ -1,11 +0,0 @@ -steps: - - label: "Validation check" - command: "./buildkite/trigger_build.sh pre-merge-test" - key: "validation-check" - - label: "Run pre merge tests" - command: | - if [ $$(buildkite-agent step get "outcome" --step "validation-check") == "passed" ]; then - buildkite-agent pipeline upload .buildkite/pipeline_smoke_test_required_before_merge.yaml - else - echo "Didn't pass validation, nothing to run" - fi diff --git a/.buildkite/trigger_build.sh b/.buildkite/trigger_build.sh deleted file mode 100644 index ff57b506c9d..00000000000 --- a/.buildkite/trigger_build.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -# Parse the webhook payload (read from stdin) -PAYLOAD=$(cat) - -echo "PAYLOAD: $PAYLOAD" -echo "KEYWORD: $1" - -# Define the allowed user(s) -ALLOWED_USERS=("zpoint" "Michaelvll" "concretevitamin" "romilbhardwaj" "cg505" "yika-luo") # GitHub usernames - -# Extract comment body and user info -COMMENT_BODY=$(echo "$PAYLOAD" | jq -r '.comment.body') -COMMENT_USER=$(echo "$PAYLOAD" | jq -r '.comment.user.login') - -# Read the keyword from the first argument -KEYWORD="$1" - -# Check if the comment contains the keyword and the user is authorized -if [[ "$COMMENT_BODY" == *"$KEYWORD"* ]] && - ( [[ " ${ALLOWED_USERS[@]} " =~ " $COMMENT_USER " ]]); then - echo "Triggering build because $KEYWORD was mentioned by authorized user: $COMMENT_USER" - exit 0 # Exit with success to continue the build -else - echo "Build not triggered. 
Either $KEYWORD not found or user not authorized." - exit 1 # Exit with failure to stop the build -fi From ab6a3112d2380e819bf95aca0036f29616f0a2bc Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 25 Nov 2024 18:13:46 +0800 Subject: [PATCH 34/64] trigger test --- tests/test_smoke.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d1dc2129422..c872fd589f8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -27,6 +27,7 @@ # All files categorized under tests/smoke_tests/* # Please add new test cases under that directory. + from smoke_tests.test_basic import * from smoke_tests.test_cluster_job import * from smoke_tests.test_images import * From 2ada082d451244346bd875b9cd0b2cd422010514 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 26 Nov 2024 14:04:04 +0800 Subject: [PATCH 35/64] bug fix --- tests/smoke_tests/test_region_and_zone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index bbfe3874315..481d1488071 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -84,7 +84,7 @@ def test_aws_with_ssh_proxy_command(): # the job controller is not launched with proxy command. 
get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', - cluster_status=ClusterStatus.UP.value, + cluster_status=[ClusterStatus.UP], timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', get_cmd_wait_until_managed_job_status_contains_matching_job_name( From 9e1416827ec871485ef871be0ec08f54578494ea Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 27 Nov 2024 16:24:27 +0800 Subject: [PATCH 36/64] generate pipeline --- .buildkite/generate_pipeline.py | 121 ++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 .buildkite/generate_pipeline.py diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py new file mode 100644 index 00000000000..cb135b41a61 --- /dev/null +++ b/.buildkite/generate_pipeline.py @@ -0,0 +1,121 @@ +"""This script generates a Buildkite pipeline from test files.""" +import ast +import copy +import os +from typing import Any, Dict, List + +import yaml + +DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] +# We only have credentials for aws, azure, and gcp. +# For those test cases that run on other clouds, +# we currently ignore them. 
+ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] + + +def _get_full_decorator_path(decorator: ast.AST) -> str: + """Recursively get the full path of a decorator.""" + if isinstance(decorator, ast.Attribute): + return f'{_get_full_decorator_path(decorator.value)}.{decorator.attr}' + elif isinstance(decorator, ast.Name): + return decorator.id + elif isinstance(decorator, ast.Call): + return _get_full_decorator_path(decorator.func) + raise ValueError(f'Unknown decorator type: {type(decorator)}') + + +def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: + """Extract test functions and filter clouds with pytest.mark + from a Python test file.""" + with open(file_path, 'r', encoding='utf-8') as file: + tree = ast.parse(file.read(), filename=file_path) + + for node in ast.walk(tree): + for child in ast.iter_child_nodes(node): + setattr(child, 'parent', node) + + function_cloud_map = {} + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'): + class_name = None + if hasattr(node, 'parent') and isinstance(node.parent, + ast.ClassDef): + class_name = node.parent.name + + clouds_to_include = [] + clouds_to_exclude = [] + for decorator in node.decorator_list: + if isinstance(decorator, ast.Call): + # We only need to consider the decorator with no arguments + # to extract clouds. 
+ continue + full_path = _get_full_decorator_path(decorator) + if full_path.startswith('pytest.mark.'): + assert isinstance(decorator, ast.Attribute) + suffix = decorator.attr + if suffix.startswith('no_'): + clouds_to_exclude.append(suffix[3:]) + else: + clouds_to_include.append(suffix) + clouds_to_include = (clouds_to_include if clouds_to_include else + copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) + clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud not in clouds_to_exclude + ] + final_clouds_to_include = [ + cloud for cloud in clouds_to_include + if cloud in ALL_CLOUDS_WITH_CREDENTIALS + ] + if clouds_to_include and not final_clouds_to_include: + print(f'Warning: {file_path}:{node.name} ' + f'is marked to run on {clouds_to_include}, ' + f'but we do not have credentials for those clouds. ' + f'Skipped.') + continue + function_name = (f'{class_name}::{node.name}' + if class_name else node.name) + function_cloud_map[function_name] = (clouds_to_include) + return function_cloud_map + + +def _generate_pipeline(test_file: str) -> Dict[str, Any]: + """Generate a Buildkite pipeline from test files.""" + steps = [] + function_cloud_map = _extract_marked_tests(test_file) + for test_function, clouds in function_cloud_map.items(): + for cloud in clouds: + step = { + 'label': f'{test_function} on {cloud}', + 'command': f'pytest {test_file}::{test_function} --{cloud}', + 'env': { + 'LOG_TO_STDOUT': '1' + } + } + steps.append(step) + # we only run one cloud per test function for now + break + return {'steps': steps} + + +def main(): + # List of test files to include in the pipeline + test_files = os.listdir('tests/smoke_tests') + + for test_file in test_files: + if not test_file.startswith('test_'): + continue + test_file_path = os.path.join('tests/smoke_tests', test_file) + pipeline = _generate_pipeline(test_file_path) + yaml_file_path = '.buildkite/pipeline_smoke_' + \ + f'{test_file.split(".")[0]}.yaml' + with open(yaml_file_path, 'w', encoding='utf-8') as 
file: + file.write('# This is an auto-generated Buildkite pipeline by ' + '.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n') + yaml.dump(pipeline, file, default_flow_style=False) + print(f'Convert {test_file_path} to {yaml_file_path}\n\n') + + +if __name__ == '__main__': + main() From a2b04154b8f506b9aa1899b17673772aa83cdd4c Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 27 Nov 2024 16:34:18 +0800 Subject: [PATCH 37/64] robust generate pipeline --- .buildkite/generate_pipeline.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index cb135b41a61..56f65d21460 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -12,6 +12,11 @@ # we currently ignore them. ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] +ALL_CLOUDS_IN_SMOKE_TESTS = [ + 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', + 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace' +] + def _get_full_decorator_path(decorator: ast.AST) -> str: """Recursively get the full path of a decorator.""" @@ -56,6 +61,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if suffix.startswith('no_'): clouds_to_exclude.append(suffix[3:]) else: + if suffix not in ALL_CLOUDS_IN_SMOKE_TESTS: + # This mark does not specify a cloud, so we skip it. 
+ continue clouds_to_include.append(suffix) clouds_to_include = (clouds_to_include if clouds_to_include else copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) From e764192f9c2f19b5b0aface270d8faa859bba3ce Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 28 Nov 2024 16:41:32 +0800 Subject: [PATCH 38/64] refactor pipeline --- .buildkite/generate_pipeline.py | 24 +- .buildkite/pipeline_smoke_test_basic.yaml | 82 -- .../pipeline_smoke_test_cluster_job.yaml | 178 ---- .buildkite/pipeline_smoke_test_images.yaml | 66 -- .../pipeline_smoke_test_managed_job.yaml | 46 - ...pipeline_smoke_test_mount_and_storage.yaml | 114 --- .../pipeline_smoke_test_region_and_zone.yaml | 28 - ...line_smoke_test_required_before_merge.yaml | 7 - .buildkite/pipeline_smoke_test_sky_serve.yaml | 33 - .../pipeline_smoke_tests_pre_merge.yaml | 8 + .buildkite/pipeline_smoke_tests_release.yaml | 874 ++++++++++++++++++ 11 files changed, 900 insertions(+), 560 deletions(-) delete mode 100644 .buildkite/pipeline_smoke_test_basic.yaml delete mode 100644 .buildkite/pipeline_smoke_test_cluster_job.yaml delete mode 100644 .buildkite/pipeline_smoke_test_images.yaml delete mode 100644 .buildkite/pipeline_smoke_test_managed_job.yaml delete mode 100644 .buildkite/pipeline_smoke_test_mount_and_storage.yaml delete mode 100644 .buildkite/pipeline_smoke_test_region_and_zone.yaml delete mode 100644 .buildkite/pipeline_smoke_test_required_before_merge.yaml delete mode 100644 .buildkite/pipeline_smoke_test_sky_serve.yaml create mode 100644 .buildkite/pipeline_smoke_tests_pre_merge.yaml create mode 100644 .buildkite/pipeline_smoke_tests_release.yaml diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 56f65d21460..6d88e2d48d2 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -1,5 +1,6 @@ """This script generates a Buildkite pipeline from test files.""" import ast +from collections import defaultdict import copy import os from typing import Any, Dict, List @@ 
-10,7 +11,7 @@ # We only have credentials for aws, azure, and gcp. # For those test cases that run on other clouds, # we currently ignore them. -ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp'] +ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp', 'kubernetes'] ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', @@ -97,7 +98,8 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: 'label': f'{test_function} on {cloud}', 'command': f'pytest {test_file}::{test_function} --{cloud}', 'env': { - 'LOG_TO_STDOUT': '1' + 'LOG_TO_STDOUT': '1', + 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' } } steps.append(step) @@ -109,20 +111,30 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: def main(): # List of test files to include in the pipeline test_files = os.listdir('tests/smoke_tests') + output_file_pipelines_map = defaultdict(list) for test_file in test_files: if not test_file.startswith('test_'): continue test_file_path = os.path.join('tests/smoke_tests', test_file) + if test_file == 'test_required_before_merge.py': + yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' + else: + yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' + print(f'Converting {test_file_path} to {yaml_file_path}') pipeline = _generate_pipeline(test_file_path) - yaml_file_path = '.buildkite/pipeline_smoke_' + \ - f'{test_file.split(".")[0]}.yaml' + output_file_pipelines_map[yaml_file_path].append(pipeline) + print(f'Converted {test_file_path} to {yaml_file_path}\n\n') + + for yaml_file_path, pipelines in output_file_pipelines_map.items(): with open(yaml_file_path, 'w', encoding='utf-8') as file: file.write('# This is an auto-generated Buildkite pipeline by ' '.buildkite/generate_pipeline.py, Please do not ' 'edit directly.\n') - yaml.dump(pipeline, file, default_flow_style=False) - print(f'Convert {test_file_path} to {yaml_file_path}\n\n') + final_pipeline = { + 'steps': [pipeline['steps'] for pipeline in pipelines] 
+ } + yaml.dump(final_pipeline, file, default_flow_style=False) if __name__ == '__main__': diff --git a/.buildkite/pipeline_smoke_test_basic.yaml b/.buildkite/pipeline_smoke_test_basic.yaml deleted file mode 100644 index d0ba641c48c..00000000000 --- a/.buildkite/pipeline_smoke_test_basic.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -steps: -- command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - env: - LOG_TO_STDOUT: '1' - label: test_example_app on aws -- command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - env: - LOG_TO_STDOUT: '1' - label: test_minimal on aws -- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - env: - LOG_TO_STDOUT: '1' - label: test_launch_fast on aws -- command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - env: - LOG_TO_STDOUT: '1' - label: test_launch_fast_with_autostop on aws -- command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - env: - LOG_TO_STDOUT: '1' - label: test_stale_job on aws -- command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_stale_job_manual_restart on aws -- command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_stale_job_manual_restart on gcp -- command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - env: - LOG_TO_STDOUT: '1' - label: test_env_check on aws -- command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - env: - LOG_TO_STDOUT: '1' - label: test_cli_logs on aws -- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - env: - LOG_TO_STDOUT: '1' - label: test_core_api_sky_launch_exec on gcp -- command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws 
- env: - LOG_TO_STDOUT: '1' - label: test_core_api_sky_launch_fast on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_ordered on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_ordered_with_default on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_unordered on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default - --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_accelerators_unordered_with_default on aws -- command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - env: - LOG_TO_STDOUT: '1' - label: test_multiple_resources on aws -- command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - env: - LOG_TO_STDOUT: '1' - label: test_sky_bench on aws -- command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - --aws - env: - LOG_TO_STDOUT: '1' - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws diff --git a/.buildkite/pipeline_smoke_test_cluster_job.yaml b/.buildkite/pipeline_smoke_test_cluster_job.yaml deleted file mode 100644 index 8a813119eb2..00000000000 --- a/.buildkite/pipeline_smoke_test_cluster_job.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - env: - LOG_TO_STDOUT: '1' - label: test_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws - env: - LOG_TO_STDOUT: '1' - label: test_job_queue_with_docker on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode - --aws - env: - LOG_TO_STDOUT: '1' - label: test_job_queue_multinode on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - env: - LOG_TO_STDOUT: '1' - label: test_large_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - env: - LOG_TO_STDOUT: '1' - label: test_fast_large_job_queue on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - env: - LOG_TO_STDOUT: '1' - label: test_docker_preinstalled_package on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - env: - LOG_TO_STDOUT: '1' - label: test_multi_echo on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - env: - LOG_TO_STDOUT: '1' - label: test_huggingface on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - env: - LOG_TO_STDOUT: '1' - label: test_inferentia on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - env: - LOG_TO_STDOUT: '1' - label: test_tpu on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - env: - LOG_TO_STDOUT: '1' - label: test_tpu_vm on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - env: - LOG_TO_STDOUT: '1' - label: test_tpu_vm_pod on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - env: - LOG_TO_STDOUT: '1' - label: test_multi_hostname on aws -- command: pytest 
tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - env: - LOG_TO_STDOUT: '1' - label: test_multi_node_failure on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_http_server_with_custom_ports on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_http_server_with_custom_ports on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_http_server_with_custom_ports on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - env: - LOG_TO_STDOUT: '1' - label: test_task_labels_aws on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - env: - LOG_TO_STDOUT: '1' - label: test_task_labels_gcp on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - env: - LOG_TO_STDOUT: '1' - label: test_distributed_tf on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_start_stop on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_start_stop on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - env: - LOG_TO_STDOUT: '1' - label: test_autostop on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - env: - LOG_TO_STDOUT: '1' - label: test_autodown on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - env: - LOG_TO_STDOUT: '1' - label: test_cancel_aws on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - env: - LOG_TO_STDOUT: '1' - label: 
test_cancel_gcp on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - env: - LOG_TO_STDOUT: '1' - label: test_cancel_azure on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - env: - LOG_TO_STDOUT: '1' - label: test_cancel_pytorch on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - env: - LOG_TO_STDOUT: '1' - label: test_use_spot on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - env: - LOG_TO_STDOUT: '1' - label: test_stop_gcp_spot on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - env: - LOG_TO_STDOUT: '1' - label: test_inline_env on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - env: - LOG_TO_STDOUT: '1' - label: test_inline_env_file on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_custom_image on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_start_stop_two_nodes on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_disk_tier on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_disk_tier on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_disk_tier on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_best_tier_failover on azure -- command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws - env: - LOG_TO_STDOUT: '1' - label: 
test_aws_zero_quota_failover on aws -- command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_zero_quota_failover on gcp -- command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - env: - LOG_TO_STDOUT: '1' - label: test_long_setup_run_script on aws diff --git a/.buildkite/pipeline_smoke_test_images.yaml b/.buildkite/pipeline_smoke_test_images.yaml deleted file mode 100644 index 4991fccbbc7..00000000000 --- a/.buildkite/pipeline_smoke_test_images.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -steps: -- command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_images on aws -- command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_images on gcp -- command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_images on azure -- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_image_id_dict on aws -- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_image_id_dict on gcp -- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_image_id_dict_region on aws -- command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_image_id_dict_region on gcp -- command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_image_id_dict_zone on aws -- command: pytest 
tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_image_id_dict_zone on gcp -- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - env: - LOG_TO_STDOUT: '1' - label: test_clone_disk_aws on aws -- command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - env: - LOG_TO_STDOUT: '1' - label: test_clone_disk_gcp on gcp -- command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_mig on gcp -- command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_force_enable_external_ips on gcp -- command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - env: - LOG_TO_STDOUT: '1' - label: test_image_no_conda on aws -- command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws - env: - LOG_TO_STDOUT: '1' - label: test_custom_default_conda_env on aws diff --git a/.buildkite/pipeline_smoke_test_managed_job.yaml b/.buildkite/pipeline_smoke_test_managed_job.yaml deleted file mode 100644 index fee2ae1f3c8..00000000000 --- a/.buildkite/pipeline_smoke_test_managed_job.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_pipeline_recovery_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_pipeline_recovery_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_multi_node_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_recovery_multi_node_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_cancellation_aws on aws -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_cancellation_gcp on gcp -- command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - env: - LOG_TO_STDOUT: '1' - label: test_managed_jobs_tpu on gcp diff --git a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml b/.buildkite/pipeline_smoke_test_mount_and_storage.yaml deleted file mode 100644 index 01f8739dd79..00000000000 --- a/.buildkite/pipeline_smoke_test_mount_and_storage.yaml +++ /dev/null @@ -1,114 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do 
not edit directly. -steps: -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws - env: - LOG_TO_STDOUT: '1' - label: test_file_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - env: - LOG_TO_STDOUT: '1' - label: test_using_file_mounts_with_env_vars on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_storage_mounts_with_stop on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_storage_mounts_with_stop on gcp -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_storage_mounts_with_stop on azure -- command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - env: - LOG_TO_STDOUT: '1' - label: test_docker_storage_mounts on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - env: - LOG_TO_STDOUT: '1' - 
label: TestStorageWithCredentials::test_bucket_external_deletion on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_public_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_nonexistent_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_private_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_list_source on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_invalid_names on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - env: - LOG_TO_STDOUT: '1' 
- label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_aws_regions on aws -- command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - env: - LOG_TO_STDOUT: '1' - label: TestStorageWithCredentials::test_gcs_regions on aws diff --git a/.buildkite/pipeline_smoke_test_region_and_zone.yaml b/.buildkite/pipeline_smoke_test_region_and_zone.yaml deleted file mode 100644 index aa955bc1864..00000000000 --- a/.buildkite/pipeline_smoke_test_region_and_zone.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_region on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_with_ssh_proxy_command on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_region_and_service_account on gcp -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - env: - LOG_TO_STDOUT: '1' - label: test_azure_region on azure -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - env: - LOG_TO_STDOUT: '1' - label: test_aws_zone on aws -- command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - env: - LOG_TO_STDOUT: '1' - label: test_gcp_zone on gcp diff --git a/.buildkite/pipeline_smoke_test_required_before_merge.yaml b/.buildkite/pipeline_smoke_test_required_before_merge.yaml deleted file mode 100644 index 8a29f838e4e..00000000000 --- a/.buildkite/pipeline_smoke_test_required_before_merge.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -steps: -- command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - env: - LOG_TO_STDOUT: '1' - label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_test_sky_serve.yaml b/.buildkite/pipeline_smoke_test_sky_serve.yaml deleted file mode 100644 index 4cd4d35aa4d..00000000000 --- a/.buildkite/pipeline_smoke_test_sky_serve.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-steps: -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_gcp_http on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_aws_http on aws -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_azure_http on azure -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_spot_recovery on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_dynamic_ondemand_fallback on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - env: - LOG_TO_STDOUT: '1' - label: test_skyserve_auto_restart on gcp -- command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - env: - LOG_TO_STDOUT: '1' - label: test_user_dependencies on aws diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml new file mode 100644 index 00000000000..a1f68140299 --- /dev/null +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -0,0 +1,8 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+steps: +- - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml new file mode 100644 index 00000000000..6a3ec46d52d --- /dev/null +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -0,0 +1,874 @@ +# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. +steps: +- - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_example_app on aws + - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_minimal on aws + - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_launch_fast on aws + - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_launch_fast_with_autostop on aws + - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_stale_job on aws + - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_stale_job_manual_restart on aws + - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_stale_job_manual_restart on gcp + - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_env_check on aws + - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cli_logs on aws + - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_core_api_sky_launch_exec on gcp + - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_core_api_sky_launch_fast on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_ordered on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_ordered_with_default on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_unordered on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_accelerators_unordered_with_default on aws + - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multiple_resources on aws + - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_sky_bench on aws + - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + env: + LOG_TO_STDOUT: 
'1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_context_failover on kubernetes + - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws +- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_queue_with_docker on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_lambda_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_queue_multinode on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_fast_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_docker_preinstalled_package on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multi_echo on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_lambda_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_inferentia on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu_vm on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu_vm_pod on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_tpu_pod_slice_gke on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multi_hostname on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_multi_node_failure on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_http_server_with_custom_ports on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_http_server_with_custom_ports on aws + - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_http_server_with_custom_ports on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_http_server_with_custom_ports on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_runpod_http_server_with_custom_ports on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_task_labels_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_task_labels_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_task_labels_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: 
test_container_logs_multinode_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_container_logs_two_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_distributed_tf on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_start_stop on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_start_stop on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_autostop on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_autodown on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_azure on 
azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_cancel_pytorch on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_use_spot on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_stop_gcp_spot on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_inline_env on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_inline_env_file on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_custom_image on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_custom_image on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_start_stop_two_nodes on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_disk_tier on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_disk_tier on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + env: + 
LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_disk_tier on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_best_tier_failover on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_zero_quota_failover on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_zero_quota_failover on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_long_setup_run_script on aws +- - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_job_pipeline on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_pipeline_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_aws on aws + - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_pipeline_recovery_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_pipeline_recovery_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_default_resources on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_multi_node_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_recovery_multi_node_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_cancellation_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_cancellation_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_storage on aws + - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_tpu on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_managed_jobs_inline_env on aws +- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_images on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_images on gcp + - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_images on azure + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_region on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_region on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_zone on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_zone on gcp + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_aws on aws + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_gcp on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_mig on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_force_enable_external_ips on gcp + - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_image_no_conda on aws + - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_custom_default_conda_env on aws +- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_gcp_http on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_aws_http on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_azure_http on azure + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_kubernetes_http on kubernetes + - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_llm on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_spot_recovery on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_base_ondemand_fallback on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_dynamic_ondemand_fallback on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_user_bug_restart on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_load_balancer on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_auto_restart on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_cancel on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_streaming on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_readiness_timeout_fail on aws + - 
command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_large_readiness_timeout on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_rolling_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_fast_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_update_autoscale on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_new_autoscaler_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_skyserve_failures on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_user_dependencies on aws +- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_file_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: 
test_using_file_mounts_with_env_vars on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_storage_mounts_with_stop on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_storage_mounts_with_stop on gcp + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_storage_mounts_with_stop on azure + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_storage_mounts on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_kubernetes_context_switch on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_docker_storage_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + on aws + - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_bucket_external_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_public_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_nonexistent_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_private_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + 
label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_list_source on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_invalid_names on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_aws_regions on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: TestStorageWithCredentials::test_gcs_regions on aws +- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_region on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: 
${PYTHONPATH}:$(pwd) + label: test_aws_with_ssh_proxy_command on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_region_and_service_account on gcp + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_region on azure + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_zone on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_zone on gcp From a63a8c9ed4d1ef193b776a76d73a911435ce92d4 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 28 Nov 2024 16:54:52 +0800 Subject: [PATCH 39/64] remove runpod --- .buildkite/generate_pipeline.py | 2 +- .buildkite/pipeline_smoke_tests_release.yaml | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 6d88e2d48d2..721af70437f 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -15,7 +15,7 @@ ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', - 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace' + 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod' ] diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 6a3ec46d52d..65c38dfe774 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -222,12 +222,6 @@ steps: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) label: test_kubernetes_http_server_with_custom_ports on kubernetes - - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_runpod_http_server_with_custom_ports - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_runpod_http_server_with_custom_ports on aws - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws env: LOG_TO_STDOUT: '1' From 7f75f9f919d66b186217cb3fae6e2ff94fc15900 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 14:02:07 +0800 Subject: [PATCH 40/64] hot fix to pass smoke test --- tests/smoke_tests/test_managed_job.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index e8d13c21354..e0d2a6a619b 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -710,6 +710,8 @@ def test_managed_jobs_storage(generic_cloud: str): job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], timeout=60 + BUMP_UP_SECONDS), + # Wait for s3 backend refresh + f'sleep {BUMP_UP_SECONDS}', f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd From 64f928288f6ebc71b8f8b0e7ccdcac82a39c400f Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 14:10:41 +0800 Subject: [PATCH 41/64] random order --- .buildkite/generate_pipeline.py | 7 +- .buildkite/pipeline_smoke_tests_release.yaml | 748 +++++++++---------- 2 files changed, 380 insertions(+), 375 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 721af70437f..a6aa1a025b7 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -3,6 +3,7 @@ from collections import defaultdict import copy import os +import random from typing import Any, Dict, List import yaml @@ -131,8 +132,12 @@ def main(): file.write('# This is an auto-generated Buildkite pipeline by ' '.buildkite/generate_pipeline.py, 
Please do not ' 'edit directly.\n') + all_steps = [pipeline['steps'] for pipeline in pipelines] + # Shuffle the steps to avoid flakyness, consecutive runs of the same + # kind of test may fail for requiring locks on the same resources. + random.shuffle(all_steps) final_pipeline = { - 'steps': [pipeline['steps'] for pipeline in pipelines] + 'steps': all_steps } yaml.dump(final_pipeline, file, default_flow_style=False) diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 65c38dfe774..59ad47bfc27 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -1,5 +1,86 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. steps: +- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_images on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_images on gcp + - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_azure_images on azure + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_region on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + env: + 
LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_region on gcp + - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_aws_image_id_dict_zone on aws + - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_image_id_dict_zone on gcp + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_aws on aws + - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_clone_disk_gcp on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_mig on gcp + - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_force_enable_external_ips on gcp + - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_image_no_conda on aws + - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_custom_default_conda_env on aws - - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws env: LOG_TO_STDOUT: '1' @@ -106,763 +187,682 @@ steps: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws +- - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws + label: test_managed_jobs on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_queue_with_docker on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + label: test_job_pipeline on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_lambda_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + label: test_managed_jobs_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_queue_multinode on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + label: test_managed_jobs_pipeline_failed_setup on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws + label: test_managed_jobs_recovery_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_fast_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + label: test_managed_jobs_recovery_gcp on gcp + - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_docker_preinstalled_package on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + label: test_managed_jobs_pipeline_recovery_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_multi_echo on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + label: test_managed_jobs_pipeline_recovery_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface + label: test_managed_jobs_recovery_default_resources on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_lambda_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + label: test_managed_jobs_recovery_multi_node_aws on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_inferentia on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + label: test_managed_jobs_recovery_multi_node_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + label: test_managed_jobs_cancellation_aws on aws + - 
command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu_vm on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + label: test_managed_jobs_cancellation_gcp on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu_vm_pod on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke - --kubernetes + label: test_managed_jobs_storage on aws + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_tpu_pod_slice_gke on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + label: test_managed_jobs_tpu on gcp + - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_multi_hostname on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure + label: test_managed_jobs_inline_env on aws +- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_multi_node_failure on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp + label: test_file_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_http_server_with_custom_ports on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + label: test_using_file_mounts_with_env_vars on aws + - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_http_server_with_custom_ports on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + label: test_aws_storage_mounts_with_stop on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + label: test_gcp_storage_mounts_with_stop on gcp + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_http_server_with_custom_ports on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + label: test_azure_storage_mounts_with_stop on azure + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_http_server_with_custom_ports on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + label: test_kubernetes_storage_mounts on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_task_labels_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + label: test_kubernetes_context_switch on kubernetes + - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_task_labels_gcp on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes + label: 
test_docker_storage_mounts on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_task_labels_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_container_logs_multinode_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes + label: TestStorageWithCredentials::test_bucket_external_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws env: 
LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_container_logs_two_jobs_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + label: TestStorageWithCredentials::test_public_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_distributed_tf on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + label: TestStorageWithCredentials::test_nonexistent_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_start_stop on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + label: TestStorageWithCredentials::test_private_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_start_stop on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws env: LOG_TO_STDOUT: '1' 
PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_autostop on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_autodown on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_gcp on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_azure on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_cancel_pytorch on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_use_spot on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_stop_gcp_spot on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_inline_env on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_inline_env_file on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: 
${PYTHONPATH}:$(pwd) - label: test_aws_custom_image on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes + label: TestStorageWithCredentials::test_list_source on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_custom_image on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure + label: TestStorageWithCredentials::test_invalid_names on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_start_stop_two_nodes on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_disk_tier on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_disk_tier on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + label: TestStorageWithCredentials::test_aws_regions on aws + - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: 
${PYTHONPATH}:$(pwd) - label: test_azure_disk_tier on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure + label: TestStorageWithCredentials::test_gcs_regions on aws +- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_best_tier_failover on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + label: test_aws_region on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_zero_quota_failover on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + label: test_aws_with_ssh_proxy_command on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_zero_quota_failover on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_long_setup_run_script on aws -- - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + label: test_gcp_region_and_service_account on gcp + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + label: test_azure_region on azure + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_job_pipeline on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws + label: 
test_aws_zone on aws + - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws + label: test_gcp_zone on gcp +- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_pipeline_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + label: test_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp + label: test_job_queue_with_docker on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + label: test_lambda_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_pipeline_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp + label: test_job_queue_multinode on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_pipeline_recovery_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + label: 
test_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_default_resources on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + label: test_fast_large_job_queue on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_multi_node_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_recovery_multi_node_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws + label: test_docker_preinstalled_package on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_cancellation_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp + label: test_multi_echo on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_cancellation_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + label: test_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_storage on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_tpu on gcp - - command: 
pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws + label: test_lambda_huggingface on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_managed_jobs_inline_env on aws -- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + label: test_inferentia on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_images on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + label: test_tpu on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_images on gcp - - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + label: test_tpu_vm on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_images on azure - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + label: test_tpu_vm_pod on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_image_id_dict on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + label: test_tpu_pod_slice_gke on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_image_id_dict on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + label: test_multi_hostname on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: 
${PYTHONPATH}:$(pwd) - label: test_aws_image_id_dict_region on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + label: test_multi_node_failure on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_image_id_dict_region on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + label: test_gcp_http_server_with_custom_ports on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_image_id_dict_zone on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_image_id_dict_zone on gcp - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + label: test_aws_http_server_with_custom_ports on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_clone_disk_aws on aws - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + label: test_azure_http_server_with_custom_ports on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_clone_disk_gcp on gcp - - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + label: test_kubernetes_http_server_with_custom_ports on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_mig on gcp - - command: pytest 
tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp + label: test_task_labels_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_force_enable_external_ips on gcp - - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + label: test_task_labels_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_image_no_conda on aws - - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws + label: test_task_labels_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_custom_default_conda_env on aws -- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_gcp_http on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_aws_http on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http - --azure + label: test_container_logs_multinode_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + 
--kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_azure_http on azure - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + label: test_container_logs_two_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_kubernetes_http on kubernetes - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_llm on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp + label: test_distributed_tf on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_spot_recovery on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws + label: test_gcp_start_stop on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_base_ondemand_fallback on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp + label: test_azure_start_stop on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_dynamic_ondemand_fallback on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws + label: test_autostop on aws + - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_autodown --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_user_bug_restart on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws + label: test_autodown on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_load_balancer on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp + label: test_cancel_aws on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_auto_restart on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + label: test_cancel_gcp on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_cancel on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + label: test_cancel_azure on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_streaming on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws + label: test_cancel_pytorch on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_readiness_timeout_fail on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws + label: test_use_spot on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: 
test_skyserve_large_readiness_timeout on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + label: test_stop_gcp_spot on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws + label: test_inline_env on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_rolling_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update - --aws + label: test_inline_env_file on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_fast_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws + label: test_aws_custom_image on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update_autoscale on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws + label: test_kubernetes_custom_image on kubernetes + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_new_autoscaler_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + label: test_azure_start_stop_two_nodes on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_failures on aws - - command: 
pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + label: test_aws_disk_tier on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_user_dependencies on aws -- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts - --aws + label: test_gcp_disk_tier on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_file_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws + label: test_azure_disk_tier on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_using_file_mounts_with_env_vars on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + label: test_azure_best_tier_failover on azure + - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_storage_mounts_with_stop on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + label: test_aws_zero_quota_failover on aws + - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_storage_mounts_with_stop on gcp - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure + label: test_gcp_zero_quota_failover on gcp + - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_storage_mounts_with_stop on 
azure - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes + label: test_long_setup_run_script on aws +- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_storage_mounts on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes + label: test_skyserve_gcp_http on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_context_switch on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws + label: test_skyserve_aws_http on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http + --azure env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_docker_storage_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws + label: test_skyserve_azure_http on azure + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws + label: test_skyserve_kubernetes_http on kubernetes + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - on aws - - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws + label: test_skyserve_llm on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + label: test_skyserve_spot_recovery on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_bucket_external_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws + label: test_skyserve_base_ondemand_fallback on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + label: test_skyserve_dynamic_ondemand_fallback on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_public_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + label: test_skyserve_user_bug_restart on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_nonexistent_bucket on aws - - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws + label: test_skyserve_load_balancer on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_private_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws + label: test_skyserve_auto_restart on gcp + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws + label: test_skyserve_cancel on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + label: test_skyserve_streaming on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_list_source on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + label: test_skyserve_readiness_timeout_fail on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_invalid_names on aws - - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws + label: test_skyserve_large_readiness_timeout on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + label: test_skyserve_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + label: test_skyserve_rolling_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_aws_regions on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + label: test_skyserve_fast_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_gcs_regions on aws -- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_region on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + label: test_skyserve_update_autoscale on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update --aws env: 
LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_with_ssh_proxy_command on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_region_and_service_account on gcp - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_region on azure - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + label: test_skyserve_new_autoscaler_update on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_zone on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + label: test_skyserve_failures on aws + - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_zone on gcp + label: test_user_dependencies on aws From 543ced443adb5c668f34cbad9316f49fc651f50b Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 21:46:24 +0800 Subject: [PATCH 42/64] allow parameter --- .buildkite/generate_pipeline.py | 121 +- .../pipeline_smoke_tests_pre_merge.yaml | 17 +- .buildkite/pipeline_smoke_tests_release.yaml | 1429 +++++++++-------- 3 files changed, 816 insertions(+), 751 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index a6aa1a025b7..5b1aded60fd 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -4,22 +4,34 @@ import copy import os import random -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import yaml DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] -# We only have credentials for aws, azure, and gcp. 
-# For those test cases that run on other clouds, -# we currently ignore them. -ALL_CLOUDS_WITH_CREDENTIALS = ['aws', 'azure', 'gcp', 'kubernetes'] ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod' ] - - +QUEUE_GENERIC_CLOUD = 'generic_cloud' +QUEUE_KUBERNETES = 'kubernetes' +# Only aws, gcp, azure, and kubernetes are supported for now. +# Other clouds do not have credentials. +CLOUD_QUEUE_MAP = { + 'aws': QUEUE_GENERIC_CLOUD, + 'gcp': QUEUE_GENERIC_CLOUD, + 'azure': QUEUE_GENERIC_CLOUD, + 'kubernetes': QUEUE_KUBERNETES +} + +GENERATED_FILE_HEAD = ( + '# This is an auto-generated Buildkite pipeline by ' + '.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n' +) + + def _get_full_decorator_path(decorator: ast.AST) -> str: """Recursively get the full path of a decorator.""" if isinstance(decorator, ast.Attribute): @@ -75,7 +87,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: ] final_clouds_to_include = [ cloud for cloud in clouds_to_include - if cloud in ALL_CLOUDS_WITH_CREDENTIALS + if cloud in CLOUD_QUEUE_MAP ] if clouds_to_include and not final_clouds_to_include: print(f'Warning: {file_path}:{node.name} ' @@ -89,7 +101,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: return function_cloud_map -def _generate_pipeline(test_file: str) -> Dict[str, Any]: +def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dict[str, Any]: """Generate a Buildkite pipeline from test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) @@ -98,49 +110,86 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: step = { 'label': f'{test_function} on {cloud}', 'command': f'pytest {test_file}::{test_function} --{cloud}', - 'env': { - 'LOG_TO_STDOUT': '1', - 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' - } + 'agents': { + # Separate agent pool for each cloud. 
+ # Since some are more costly + 'queue': CLOUD_QUEUE_MAP[cloud] + }, + 'if': f'build.env.{cloud} == \'1\'' } steps.append(step) - # we only run one cloud per test function for now - break + if one_cloud_per_test_function: + break return {'steps': steps} -def main(): - # List of test files to include in the pipeline - test_files = os.listdir('tests/smoke_tests') - output_file_pipelines_map = defaultdict(list) - - for test_file in test_files: - if not test_file.startswith('test_'): - continue - test_file_path = os.path.join('tests/smoke_tests', test_file) - if test_file == 'test_required_before_merge.py': - yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' - else: - yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' - print(f'Converting {test_file_path} to {yaml_file_path}') - pipeline = _generate_pipeline(test_file_path) - output_file_pipelines_map[yaml_file_path].append(pipeline) - print(f'Converted {test_file_path} to {yaml_file_path}\n\n') +def _dump_pipeline_to_file( + output_file_pipelines_map: Dict[str, List[Dict[str, Any]]], + extra_env: Optional[Dict[str, str]] = None): + default_env = { + 'LOG_TO_STDOUT': '1', + 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' + } + if extra_env: + default_env.update(extra_env) for yaml_file_path, pipelines in output_file_pipelines_map.items(): with open(yaml_file_path, 'w', encoding='utf-8') as file: - file.write('# This is an auto-generated Buildkite pipeline by ' - '.buildkite/generate_pipeline.py, Please do not ' - 'edit directly.\n') + file.write(GENERATED_FILE_HEAD) all_steps = [pipeline['steps'] for pipeline in pipelines] # Shuffle the steps to avoid flakyness, consecutive runs of the same # kind of test may fail for requiring locks on the same resources. 
random.shuffle(all_steps) final_pipeline = { - 'steps': all_steps + 'steps': all_steps, + 'env': default_env } yaml.dump(final_pipeline, file, default_flow_style=False) +def _convert_release(test_files: List[str]): + yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' + output_file_pipelines_map = defaultdict(list) + for test_file in test_files: + print(f'Converting {test_file} to {yaml_file_path}') + # We only need to run one cloud per test function. + pipeline = _generate_pipeline(test_file, True) + output_file_pipelines_map[yaml_file_path].append(pipeline) + print(f'Converted {test_file} to {yaml_file_path}\n\n') + # Enable all clouds by default for release pipeline. + _dump_pipeline_to_file(output_file_pipelines_map, extra_env={ + cloud: '1' for cloud in CLOUD_QUEUE_MAP + }) + + +def _convert_pre_merge(test_files: List[str]): + yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' + output_file_pipelines_map = defaultdict(list) + for test_file in test_files: + print(f'Converting {test_file} to {yaml_file_path}') + # We want enable all clouds by default for each test function + # for pre-merge. And let the author controls which clouds + # to run by parameter. 
+ pipeline = _generate_pipeline(test_file, False) + output_file_pipelines_map[yaml_file_path].append(pipeline) + print(f'Converted {test_file} to {yaml_file_path}\n\n') + _dump_pipeline_to_file(output_file_pipelines_map) + +def main(): + test_files = os.listdir('tests/smoke_tests') + release_files = [] + pre_merge_files = [] + for test_file in test_files: + if not test_file.startswith('test_'): + continue + test_file_path = os.path.join('tests/smoke_tests', test_file) + if "required_before_merge" in test_file: + pre_merge_files.append(test_file_path) + else: + release_files.append(test_file_path) + + _convert_release(release_files) + _convert_pre_merge(pre_merge_files) + if __name__ == '__main__': main() diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index a1f68140299..be0e34876dc 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -1,8 +1,17 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) steps: -- - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_yaml_launch_and_mount on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --azure + if: build.env.azure == '1' + label: test_yaml_launch_and_mount on azure diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 59ad47bfc27..06a4d750931 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -1,868 +1,875 @@ # This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
+env: + LOG_TO_STDOUT: '1' + PYTHONPATH: ${PYTHONPATH}:$(pwd) + aws: '1' + azure: '1' + gcp: '1' + kubernetes: '1' steps: -- - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts + --aws + if: build.env.aws == '1' + label: test_file_mounts on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + if: build.env.aws == '1' + label: test_using_file_mounts_with_env_vars on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + if: build.env.aws == '1' + label: test_aws_storage_mounts_with_stop on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env.gcp == '1' + label: test_gcp_storage_mounts_with_stop on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env.azure == '1' + label: test_azure_storage_mounts_with_stop on azure + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_storage_mounts on kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_context_switch on kubernetes + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + if: build.env.aws == '1' + label: test_docker_storage_mounts on aws + - 
agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_external_deletion on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_public_bucket on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_nonexistent_bucket on aws + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_private_bucket on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_list_source on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_invalid_names on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env.aws == '1' + 
label: TestStorageWithCredentials::test_aws_regions on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_gcs_regions on aws +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env.gcp == '1' + label: test_skyserve_gcp_http on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + if: build.env.aws == '1' + label: test_skyserve_aws_http on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http + --azure + if: build.env.azure == '1' + label: test_skyserve_azure_http on azure + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + if: build.env.kubernetes == '1' + label: test_skyserve_kubernetes_http on kubernetes + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env.aws == '1' + label: test_skyserve_llm on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + if: build.env.gcp == '1' + label: test_skyserve_spot_recovery on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + if: build.env.aws == '1' + label: test_skyserve_base_ondemand_fallback on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env.gcp == '1' + label: test_skyserve_dynamic_ondemand_fallback on gcp + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + if: build.env.aws == '1' + label: test_skyserve_user_bug_restart on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws + if: build.env.aws == '1' + label: test_skyserve_load_balancer on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + if: build.env.gcp == '1' + label: test_skyserve_auto_restart on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + if: build.env.aws == '1' + label: test_skyserve_cancel on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env.aws == '1' + label: test_skyserve_streaming on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --aws + if: build.env.aws == '1' + label: test_skyserve_readiness_timeout_fail on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + if: build.env.aws == '1' + label: test_skyserve_large_readiness_timeout on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + if: build.env.aws == '1' + label: test_skyserve_update on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws + if: build.env.aws == '1' + label: test_skyserve_rolling_update on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update + --aws + if: build.env.aws == '1' + label: test_skyserve_fast_update on aws + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --aws + if: build.env.aws == '1' + label: test_skyserve_update_autoscale on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --aws + if: build.env.aws == '1' + label: test_skyserve_new_autoscaler_update on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + if: build.env.aws == '1' + label: test_skyserve_failures on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + if: build.env.aws == '1' + label: test_user_dependencies on aws +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + if: build.env.aws == '1' label: test_aws_images on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + if: build.env.gcp == '1' label: test_gcp_images on gcp - - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env.azure == '1' label: test_azure_images on azure - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env.aws == '1' label: test_aws_image_id_dict on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - 
agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env.gcp == '1' label: test_gcp_image_id_dict on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_image_id_dict_region on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_image_id_dict_region on gcp - - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_image_id_dict_zone on aws - - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_image_id_dict_zone on gcp - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + if: build.env.aws == '1' label: test_clone_disk_aws on aws - - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: 
+ queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + if: build.env.gcp == '1' label: test_clone_disk_gcp on gcp - - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env.gcp == '1' label: test_gcp_mig on gcp - - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_force_enable_external_ips on gcp - - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + if: build.env.aws == '1' label: test_image_no_conda on aws - - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_custom_default_conda_env on aws -- - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + if: build.env.aws == '1' label: test_example_app on aws - - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_basic.py::test_minimal --aws + if: build.env.aws == '1' label: test_minimal on aws - - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + if: build.env.aws == '1' label: test_launch_fast on aws - - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_launch_fast_with_autostop on aws - - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + if: build.env.aws == '1' label: test_stale_job on aws - - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_stale_job_manual_restart on aws - - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_stale_job_manual_restart on gcp - - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws 
+ if: build.env.aws == '1' label: test_env_check on aws - - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + if: build.env.aws == '1' label: test_cli_logs on aws - - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_core_api_sky_launch_exec on gcp - - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_core_api_sky_launch_fast on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_ordered on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_ordered_with_default on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered --aws - 
env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_unordered on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multiple_accelerators_unordered_with_default on aws - - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env.aws == '1' label: test_multiple_resources on aws - - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + if: build.env.aws == '1' label: test_sky_bench on aws - - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_kubernetes_context_failover on kubernetes - - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + if: build.env.aws == '1' + label: test_aws_region on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + if: build.env.aws == '1' + label: test_aws_with_ssh_proxy_command on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + if: build.env.gcp == '1' + label: test_gcp_region_and_service_account on gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env.azure == '1' + label: test_azure_region on azure + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + if: build.env.aws == '1' + label: test_aws_zone on aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env.gcp == '1' + label: test_gcp_zone on gcp +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + if: build.env.aws == '1' label: test_managed_jobs on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + if: build.env.aws == '1' label: test_job_pipeline on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup --aws - env: - 
LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_pipeline_failed_setup on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_recovery_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_pipeline_recovery_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_pipeline_recovery_gcp on gcp - - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_recovery_default_resources on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_recovery_multi_node_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_recovery_multi_node_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_cancellation_aws on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_managed_jobs_cancellation_gcp on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + - agents: + queue: 
generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_storage on aws - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env.gcp == '1' label: test_managed_jobs_tpu on gcp - - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_managed_jobs_inline_env on aws -- - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_file_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_using_file_mounts_with_env_vars on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_storage_mounts_with_stop on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_storage_mounts_with_stop on gcp - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_storage_mounts_with_stop on azure - - 
command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_storage_mounts on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_kubernetes_context_switch on kubernetes - - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_docker_storage_mounts on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_bucket_external_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: 
TestStorageWithCredentials::test_bucket_bulk_deletion on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_public_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_nonexistent_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_private_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_list_source on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_invalid_names on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - env: - 
LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_aws_regions on aws - - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: TestStorageWithCredentials::test_gcs_regions on aws -- - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_region on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_with_ssh_proxy_command on aws - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_region_and_service_account on gcp - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_azure_region on azure - - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_aws_zone on aws - - command: pytest 
tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_gcp_zone on gcp -- - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) +- - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + if: build.env.aws == '1' label: test_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_job_queue_with_docker on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + if: build.env.aws == '1' label: test_lambda_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_job_queue_multinode on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + if: build.env.aws == '1' label: test_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_fast_large_job_queue on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_docker_preinstalled_package on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + if: build.env.aws == '1' label: test_multi_echo on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + if: build.env.aws == '1' label: test_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_lambda_huggingface on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + if: build.env.aws == '1' label: test_inferentia on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: 
generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + if: build.env.gcp == '1' label: test_tpu on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + if: build.env.gcp == '1' label: test_tpu_vm on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + if: build.env.gcp == '1' label: test_tpu_vm_pod on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_tpu_pod_slice_gke on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + if: build.env.aws == '1' label: test_multi_hostname on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_multi_node_failure on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports --gcp - 
env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_http_server_with_custom_ports on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_http_server_with_custom_ports on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.azure == '1' label: test_azure_http_server_with_custom_ports on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_kubernetes_http_server_with_custom_ports on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env.aws == '1' label: test_task_labels_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env.gcp == '1' label: test_task_labels_gcp on gcp - - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_task_labels_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_container_logs_multinode_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_container_logs_two_jobs_kubernetes on 
kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + if: build.env.aws == '1' label: test_distributed_tf on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + if: build.env.gcp == '1' label: test_gcp_start_stop on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + if: build.env.azure == '1' label: test_azure_start_stop on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + if: build.env.aws == '1' label: test_autostop on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_autodown --aws + if: build.env.aws == '1' label: test_autodown on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + if: build.env.aws == '1' label: test_cancel_aws on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env.gcp == '1' label: test_cancel_gcp on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env.azure == '1' label: test_cancel_azure on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + if: build.env.aws == '1' label: test_cancel_pytorch on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + if: build.env.aws == '1' label: test_use_spot on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + if: build.env.gcp == '1' 
label: test_stop_gcp_spot on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + if: build.env.aws == '1' label: test_inline_env on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env.aws == '1' label: test_inline_env_file on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env.aws == '1' label: test_aws_custom_image on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + - agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.kubernetes == '1' label: test_kubernetes_custom_image on kubernetes - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.azure == '1' label: test_azure_start_stop_two_nodes on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier 
--aws + if: build.env.aws == '1' label: test_aws_disk_tier on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env.gcp == '1' label: test_gcp_disk_tier on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + if: build.env.azure == '1' label: test_azure_disk_tier on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.azure == '1' label: test_azure_best_tier_failover on azure - - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_aws_zero_quota_failover on aws - - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.gcp == '1' label: test_gcp_zero_quota_failover on gcp - - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + - agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script --aws - env: 
- LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) + if: build.env.aws == '1' label: test_long_setup_run_script on aws -- - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_gcp_http on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_aws_http on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http - --azure - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_azure_http on azure - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_kubernetes_http on kubernetes - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_llm on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_spot_recovery on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_base_ondemand_fallback on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_dynamic_ondemand_fallback on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_user_bug_restart on aws - - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_load_balancer on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_auto_restart on gcp - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_cancel on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_streaming on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_readiness_timeout_fail on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_large_readiness_timeout on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_rolling_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_fast_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_update_autoscale on aws - - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_new_autoscaler_update on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_skyserve_failures on aws - - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - label: test_user_dependencies on aws From 2cff4bd74a1aef37fd08e6e812b245a9110b4bb5 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 22:51:18 +0800 Subject: [PATCH 43/64] bug fix --- .buildkite/generate_pipeline.py | 4 +- .../pipeline_smoke_tests_pre_merge.yaml | 24 +- .buildkite/pipeline_smoke_tests_release.yaml | 1722 ++++++++--------- 3 files changed, 871 insertions(+), 879 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 5b1aded60fd..45efa758844 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -136,7 +136,9 @@ def _dump_pipeline_to_file( for yaml_file_path, pipelines in output_file_pipelines_map.items(): with open(yaml_file_path, 'w', encoding='utf-8') as file: file.write(GENERATED_FILE_HEAD) - all_steps = [pipeline['steps'] for pipeline in pipelines] + all_steps = [] + for pipeline in pipelines: + all_steps.extend(pipeline['steps']) # Shuffle the steps to avoid flakyness, consecutive runs of the same # kind of test may fail for requiring locks on the same resources. 
random.shuffle(all_steps) diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index be0e34876dc..41d2909b1f8 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -3,15 +3,15 @@ env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) steps: -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env.aws == '1' - label: test_yaml_launch_and_mount on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --azure - if: build.env.azure == '1' - label: test_yaml_launch_and_mount on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --azure + if: build.env.azure == '1' + label: test_yaml_launch_and_mount on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + if: build.env.aws == '1' + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 06a4d750931..928a79c0ded 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -7,869 +7,859 @@ env: gcp: '1' kubernetes: '1' steps: -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts - --aws - if: build.env.aws == '1' - label: test_file_mounts on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - if: build.env.aws == '1' - label: test_using_file_mounts_with_env_vars on aws - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - if: build.env.aws == '1' - label: test_aws_storage_mounts_with_stop on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - if: build.env.gcp == '1' - label: test_gcp_storage_mounts_with_stop on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env.azure == '1' - label: test_azure_storage_mounts_with_stop on azure - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_storage_mounts on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_switch on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - if: build.env.aws == '1' - label: test_docker_storage_mounts on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - on aws - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_external_deletion on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_public_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_nonexistent_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_private_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - if: build.env.aws == '1' - label: 
TestStorageWithCredentials::test_copy_mount_existing_storage on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_list_source on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_invalid_names on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_aws_regions on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_gcs_regions on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env.gcp == '1' - label: test_skyserve_gcp_http on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - if: build.env.aws == '1' - label: test_skyserve_aws_http 
on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http - --azure - if: build.env.azure == '1' - label: test_skyserve_azure_http on azure - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - if: build.env.kubernetes == '1' - label: test_skyserve_kubernetes_http on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - if: build.env.aws == '1' - label: test_skyserve_llm on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env.gcp == '1' - label: test_skyserve_spot_recovery on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws - if: build.env.aws == '1' - label: test_skyserve_base_ondemand_fallback on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env.gcp == '1' - label: test_skyserve_dynamic_ondemand_fallback on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - if: build.env.aws == '1' - label: test_skyserve_user_bug_restart on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws - if: build.env.aws == '1' - label: test_skyserve_load_balancer on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env.gcp == '1' - label: test_skyserve_auto_restart on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - if: build.env.aws == '1' - label: test_skyserve_cancel 
on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env.aws == '1' - label: test_skyserve_streaming on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws - if: build.env.aws == '1' - label: test_skyserve_readiness_timeout_fail on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - if: build.env.aws == '1' - label: test_skyserve_large_readiness_timeout on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env.aws == '1' - label: test_skyserve_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - if: build.env.aws == '1' - label: test_skyserve_rolling_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update - --aws - if: build.env.aws == '1' - label: test_skyserve_fast_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws - if: build.env.aws == '1' - label: test_skyserve_update_autoscale on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - if: build.env.aws == '1' - label: test_skyserve_new_autoscaler_update on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env.aws == '1' - label: test_skyserve_failures on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env.aws == '1' - label: test_user_dependencies on aws -- - agents: - 
queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - if: build.env.aws == '1' - label: test_aws_images on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env.gcp == '1' - label: test_gcp_images on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env.azure == '1' - label: test_azure_images on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region - --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict_region on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_region on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone - --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict_zone on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone - --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_zone on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - if: build.env.aws == '1' - label: test_clone_disk_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - if: build.env.gcp == '1' - label: test_clone_disk_gcp on gcp 
- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env.gcp == '1' - label: test_gcp_mig on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env.gcp == '1' - label: test_gcp_force_enable_external_ips on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env.aws == '1' - label: test_image_no_conda on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws - if: build.env.aws == '1' - label: test_custom_default_conda_env on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - if: build.env.aws == '1' - label: test_example_app on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - if: build.env.aws == '1' - label: test_minimal on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - if: build.env.aws == '1' - label: test_launch_fast on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - if: build.env.aws == '1' - label: test_launch_fast_with_autostop on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - if: build.env.aws == '1' - label: test_stale_job on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - if: build.env.aws == '1' - label: test_aws_stale_job_manual_restart on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - if: build.env.gcp == '1' - label: 
test_gcp_stale_job_manual_restart on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - if: build.env.aws == '1' - label: test_env_check on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - if: build.env.aws == '1' - label: test_cli_logs on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec - --gcp - if: build.env.gcp == '1' - label: test_core_api_sky_launch_exec on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast - --aws - if: build.env.aws == '1' - label: test_core_api_sky_launch_fast on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered_with_default on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered_with_default on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - if: build.env.aws == '1' - label: test_multiple_resources on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env.aws == '1' - label: test_sky_bench on aws 
- - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_failover on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - --aws - if: build.env.aws == '1' - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - if: build.env.aws == '1' - label: test_aws_region on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - if: build.env.aws == '1' - label: test_aws_with_ssh_proxy_command on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - if: build.env.gcp == '1' - label: test_gcp_region_and_service_account on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env.azure == '1' - label: test_azure_region on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - if: build.env.aws == '1' - label: test_aws_zone on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env.gcp == '1' - label: test_gcp_zone on gcp -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - if: build.env.aws == '1' - label: test_managed_jobs on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env.aws == '1' - label: test_job_pipeline on aws - - agents: - queue: generic_cloud - 
command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws - if: build.env.aws == '1' - label: test_managed_jobs_failed_setup on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_failed_setup on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_recovery_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_pipeline_recovery_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_default_resources on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_multi_node_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_multi_node_gcp on gcp - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_cancellation_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_cancellation_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --aws - if: build.env.aws == '1' - label: test_managed_jobs_storage on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_tpu on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws - if: build.env.aws == '1' - label: test_managed_jobs_inline_env on aws -- - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env.aws == '1' - label: test_job_queue on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws - if: build.env.aws == '1' - label: test_job_queue_with_docker on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - if: build.env.aws == '1' - label: test_lambda_job_queue on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode - --aws - if: build.env.aws == '1' - label: test_job_queue_multinode on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - if: build.env.aws == '1' - label: test_large_job_queue on aws - - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - if: build.env.aws == '1' - label: test_fast_large_job_queue on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env.aws == '1' - label: test_docker_preinstalled_package on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - if: build.env.aws == '1' - label: test_multi_echo on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - if: build.env.aws == '1' - label: test_huggingface on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface - --aws - if: build.env.aws == '1' - label: test_lambda_huggingface on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env.aws == '1' - label: test_inferentia on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - if: build.env.gcp == '1' - label: test_tpu on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env.gcp == '1' - label: test_tpu_vm on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - if: build.env.gcp == '1' - label: test_tpu_vm_pod on gcp - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke - --kubernetes - if: build.env.kubernetes == '1' - label: test_tpu_pod_slice_gke on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - if: build.env.aws == '1' - label: test_multi_hostname on aws - - agents: - queue: generic_cloud - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure - --aws - if: build.env.aws == '1' - label: test_multi_node_failure on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env.gcp == '1' - label: test_gcp_http_server_with_custom_ports on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - if: build.env.aws == '1' - label: test_aws_http_server_with_custom_ports on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env.azure == '1' - label: test_azure_http_server_with_custom_ports on azure - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_http_server_with_custom_ports on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - if: build.env.aws == '1' - label: test_task_labels_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env.gcp == '1' - label: test_task_labels_gcp on gcp - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_task_labels_kubernetes on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - if: build.env.kubernetes == '1' - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - - agents: - queue: kubernetes - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env.kubernetes == '1' - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_multinode_kubernetes on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_jobs_kubernetes on kubernetes - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - if: build.env.aws == '1' - label: test_distributed_tf on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env.gcp == '1' - label: test_gcp_start_stop on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - if: build.env.azure == '1' - label: test_azure_start_stop on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - if: build.env.aws == '1' - label: test_autostop on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - if: build.env.aws == '1' - label: test_autodown on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - if: build.env.aws == '1' - 
label: test_cancel_aws on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - if: build.env.gcp == '1' - label: test_cancel_gcp on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - if: build.env.azure == '1' - label: test_cancel_azure on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env.aws == '1' - label: test_cancel_pytorch on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - if: build.env.aws == '1' - label: test_use_spot on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - if: build.env.gcp == '1' - label: test_stop_gcp_spot on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - if: build.env.aws == '1' - label: test_inline_env on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - if: build.env.aws == '1' - label: test_inline_env_file on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env.aws == '1' - label: test_aws_custom_image on aws - - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_custom_image on kubernetes - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env.azure == '1' - label: test_azure_start_stop_two_nodes on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier 
--aws - if: build.env.aws == '1' - label: test_aws_disk_tier on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - if: build.env.gcp == '1' - label: test_gcp_disk_tier on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env.azure == '1' - label: test_azure_disk_tier on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - if: build.env.azure == '1' - label: test_azure_best_tier_failover on azure - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws - if: build.env.aws == '1' - label: test_aws_zero_quota_failover on aws - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env.gcp == '1' - label: test_gcp_zero_quota_failover on gcp - - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - if: build.env.aws == '1' - label: test_long_setup_run_script on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_gcs_regions on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + if: build.env.gcp == '1' + label: test_clone_disk_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env.azure == '1' + label: test_azure_region on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + if: build.env.aws == '1' + label: 
test_aws_disk_tier on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + if: build.env.gcp == '1' + label: test_gcp_http_server_with_custom_ports on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws + if: build.env.aws == '1' + label: test_fast_large_job_queue on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_http_server_with_custom_ports on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + if: build.env.aws == '1' + label: test_multi_echo on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + if: build.env.aws == '1' + label: test_minimal on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_unordered on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env.gcp == '1' + label: test_gcp_mig on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_context_switch on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + if: build.env.aws == '1' + label: test_autodown on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws + if: build.env.aws == '1' + label: 
TestStorageWithCredentials::test_upload_to_existing_bucket on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_unordered_with_default on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env.aws == '1' + label: test_skyserve_streaming on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_recovery_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_pipeline_recovery_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + if: build.env.aws == '1' + label: test_aws_with_ssh_proxy_command on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + if: build.env.gcp == '1' + label: test_skyserve_spot_recovery on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws + 
if: build.env.aws == '1' + label: test_skyserve_rolling_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + if: build.env.gcp == '1' + label: test_gcp_image_id_dict_zone on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_ordered on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + if: build.env.aws == '1' + label: test_aws_images on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env.azure == '1' + label: test_azure_storage_mounts_with_stop on azure +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop + --kubernetes + if: build.env.kubernetes == '1' + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp + if: build.env.gcp == '1' + label: test_gcp_region_and_service_account on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + if: build.env.aws == '1' + label: test_inline_env on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + if: build.env.azure == '1' + label: test_azure_disk_tier on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + if: build.env.aws == '1' + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + if: build.env.gcp == '1' + label: test_stop_gcp_spot on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + if: build.env.aws == '1' + label: test_skyserve_large_readiness_timeout on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + --aws + if: build.env.aws == '1' + label: test_aws_stale_job_manual_restart on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + if: build.env.aws == '1' + label: test_image_no_conda on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + --aws + if: build.env.aws == '1' + label: test_managed_jobs_storage on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_recovery_multi_node_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + if: build.env.gcp == '1' + label: test_gcp_stale_job_manual_restart on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + --aws + if: build.env.aws == '1' + label: test_aws_http_server_with_custom_ports on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env.gcp == '1' + label: test_cancel_gcp on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_task_labels_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: 
pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + if: build.env.aws == '1' + label: test_core_api_sky_launch_fast on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + if: build.env.aws == '1' + label: test_skyserve_base_ondemand_fallback on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + if: build.env.aws == '1' + label: test_clone_disk_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + if: build.env.aws == '1' + label: test_autostop on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --aws + if: build.env.aws == '1' + label: test_managed_jobs_pipeline_failed_setup on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env.gcp == '1' + label: test_skyserve_dynamic_ondemand_fallback on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + if: build.env.aws == '1' + label: test_job_pipeline on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_public_bucket on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_private_bucket on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env.gcp == '1' + label: test_task_labels_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + if: build.env.aws == '1' + label: test_multi_node_failure on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + --kubernetes + if: build.env.kubernetes == '1' + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts + --aws + if: build.env.aws == '1' + label: test_docker_storage_mounts on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + if: build.env.aws == '1' + label: test_huggingface on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + if: build.env.aws == '1' + label: test_file_mounts on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + if: build.env.gcp == '1' + label: test_gcp_zero_quota_failover on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws + if: build.env.aws == '1' + label: 
test_lambda_huggingface on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env.gcp == '1' + label: test_gcp_image_id_dict on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + if: build.env.aws == '1' + label: test_aws_storage_mounts_with_stop on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + if: build.env.aws == '1' + label: test_launch_fast_with_autostop on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env.aws == '1' + label: test_aws_image_id_dict on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + if: build.env.gcp == '1' + label: test_gcp_image_id_dict_region on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + if: build.env.azure == '1' + label: test_azure_best_tier_failover on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + --aws + if: build.env.aws == '1' + label: test_skyserve_readiness_timeout_fail on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + if: build.env.aws == '1' + label: test_multi_hostname on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + if: build.env.aws == '1' + label: test_multiple_accelerators_ordered_with_default on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + if: build.env.azure == '1' + label: 
test_skyserve_azure_http on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + if: build.env.aws == '1' + label: test_job_queue_multinode on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + if: build.env.aws == '1' + label: test_skyserve_fast_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + if: build.env.aws == '1' + label: test_skyserve_aws_http on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update + --aws + if: build.env.aws == '1' + label: test_skyserve_new_autoscaler_update on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_storage_mounts on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env.azure == '1' + label: test_azure_images on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + if: build.env.aws == '1' + label: test_use_spot on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_tpu on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + if: build.env.aws == '1' + label: test_skyserve_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + if: build.env.gcp == '1' + label: test_gcp_start_stop on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + if: build.env.aws == '1' + label: test_skyserve_user_bug_restart on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + if: build.env.aws == '1' + label: test_env_check on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + if: build.env.gcp == '1' + label: test_skyserve_auto_restart on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env.gcp == '1' + label: test_gcp_storage_mounts_with_stop on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws + if: build.env.aws == '1' + label: test_managed_jobs_inline_env on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + --aws + if: build.env.aws == '1' + label: test_using_file_mounts_with_env_vars on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env.aws == '1' + label: test_aws_custom_image on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + if: 
build.env.aws == '1' + label: test_job_queue on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + if: build.env.aws == '1' + label: test_long_setup_run_script on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + if: build.env.aws == '1' + label: test_skyserve_failures on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_custom_image on kubernetes +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + if: build.env.kubernetes == '1' + label: test_skyserve_kubernetes_http on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + if: build.env.gcp == '1' + label: test_tpu_vm on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env.aws == '1' + label: test_inline_env_file on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws + if: build.env.aws == '1' + label: test_skyserve_load_balancer on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_pipeline_recovery_gcp on gcp +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + if: build.env.aws == '1' + label: test_distributed_tf on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + --aws + if: build.env.aws == '1' + label: test_managed_jobs_failed_setup on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_recovery_multi_node_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env.aws == '1' + label: test_multiple_resources on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_container_logs_two_jobs_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + if: build.env.aws == '1' + label: test_cancel_pytorch on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + if: build.env.aws == '1' + label: test_stale_job on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + if: build.env.gcp == '1' + label: test_gcp_force_enable_external_ips on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env.azure == '1' + label: test_cancel_azure on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_aws_regions on aws +- agents: + queue: generic_cloud + command: 
pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_nonexistent_bucket on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + --aws + if: build.env.aws == '1' + label: test_skyserve_update_autoscale on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws + if: build.env.aws == '1' + label: test_job_queue_with_docker on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + if: build.env.kubernetes == '1' + label: test_kubernetes_context_failover on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + if: build.env.aws == '1' + label: test_cli_logs on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + --aws + if: build.env.aws == '1' + label: test_aws_zero_quota_failover on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + if: build.env.aws == '1' + label: test_aws_zone on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + if: build.env.aws == '1' + label: test_cancel_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + if: build.env.aws == '1' + label: test_aws_image_id_dict_region on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + if: build.env.aws == '1' + label: test_lambda_job_queue on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + if: build.env.azure == '1' + label: test_azure_start_stop_two_nodes on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env.aws == '1' + label: test_task_labels_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package + --aws + if: build.env.aws == '1' + label: test_docker_preinstalled_package on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + if: build.env.azure == '1' + label: test_azure_http_server_with_custom_ports on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + if: build.env.gcp == '1' + label: test_tpu_vm_pod on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + if: build.env.aws == '1' + label: test_launch_fast on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_bucket_external_deletion on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + if: build.env.aws == '1' + label: test_custom_default_conda_env on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_invalid_names on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + if: build.env.aws == '1' + label: test_aws_region on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + if: build.env.aws == '1' + label: test_example_app on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + if: build.env.aws == '1' + label: test_skyserve_cancel on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + if: build.env.aws == '1' + label: test_managed_jobs_cancellation_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + if: build.env.aws == '1' + label: test_inferentia on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env.gcp == '1' + label: test_skyserve_gcp_http on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + if: build.env.kubernetes == '1' + label: test_tpu_pod_slice_gke on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + if: build.env.gcp == '1' + label: test_gcp_images on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + if: build.env.kubernetes == '1' + label: test_container_logs_multinode_kubernetes on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu 
--gcp + if: build.env.gcp == '1' + label: test_tpu on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_cancellation_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env.aws == '1' + label: test_skyserve_llm on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + if: build.env.aws == '1' + label: test_sky_bench on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + if: build.env.aws == '1' + label: test_large_job_queue on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env.gcp == '1' + label: test_gcp_disk_tier on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + if: build.env.aws == '1' + label: test_user_dependencies on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + if: build.env.aws == '1' + label: test_managed_jobs on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + if: build.env.aws == '1' + label: TestStorageWithCredentials::test_list_source on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp + if: build.env.gcp == '1' + label: test_core_api_sky_launch_exec on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + if: build.env.aws == '1' + label: test_aws_image_id_dict_zone on aws +- agents: + queue: generic_cloud + command: 
pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env.gcp == '1' + label: test_gcp_zone on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp + if: build.env.gcp == '1' + label: test_managed_jobs_recovery_gcp on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + if: build.env.azure == '1' + label: test_azure_start_stop on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws + if: build.env.aws == '1' + label: test_managed_jobs_recovery_default_resources on aws From 19fc691fcf72f1d9cc237c3e4136149aaa49c80d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 23:03:07 +0800 Subject: [PATCH 44/64] bug fix --- .buildkite/generate_pipeline.py | 2 +- .../pipeline_smoke_tests_pre_merge.yaml | 12 +- .buildkite/pipeline_smoke_tests_release.yaml | 1052 ++++++++--------- 3 files changed, 533 insertions(+), 533 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 45efa758844..3c3b9c41edf 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -115,7 +115,7 @@ def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dic # Since some are more costly 'queue': CLOUD_QUEUE_MAP[cloud] }, - 'if': f'build.env.{cloud} == \'1\'' + 'if': f'build.env("{cloud}") == "1"' } steps.append(step) if one_cloud_per_test_function: diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index 41d2909b1f8..35ba7ea17ec 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -6,12 +6,12 @@ steps: - agents: queue: generic_cloud command: pytest 
tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --azure - if: build.env.azure == '1' - label: test_yaml_launch_and_mount on azure + --aws + if: build.env("aws") == "1" + label: test_yaml_launch_and_mount on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env.aws == '1' - label: test_yaml_launch_and_mount on aws + --azure + if: build.env("azure") == "1" + label: test_yaml_launch_and_mount on azure diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index 928a79c0ded..fb22f52afec 100644 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -9,857 +9,857 @@ env: steps: - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_gcs_regions on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_spot_recovery on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - if: build.env.gcp == '1' - label: test_clone_disk_gcp on gcp + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env("azure") == "1" + label: test_azure_storage_mounts_with_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env.azure == '1' - label: test_azure_region on azure + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_list_source on aws - agents: queue: generic_cloud - 
command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - if: build.env.aws == '1' - label: test_aws_disk_tier on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_cancellation_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env.gcp == '1' - label: test_gcp_http_server_with_custom_ports on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + if: build.env("aws") == "1" + label: test_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - if: build.env.aws == '1' - label: test_fast_large_job_queue on aws + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + if: build.env("aws") == "1" + label: test_cli_logs on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env("gcp") == "1" + label: test_gcp_disk_tier on gcp - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_http_server_with_custom_ports on kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_custom_image on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - if: build.env.aws == '1' - label: test_multi_echo on aws + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + if: build.env("aws") == "1" + label: test_launch_fast on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_basic.py::test_minimal --aws - if: build.env.aws == '1' - label: test_minimal on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + if: build.env("azure") == "1" + label: test_skyserve_azure_http on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env.gcp == '1' - label: test_gcp_mig on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_switch on kubernetes + if: build.env("aws") == "1" + label: test_managed_jobs_storage on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - if: build.env.aws == '1' - label: test_autodown on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_aws_regions on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_nonexistent_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + 
command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_unordered_with_default on aws + if: build.env("aws") == "1" + label: test_long_setup_run_script on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env.aws == '1' - label: test_skyserve_streaming on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_private_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + if: build.env("gcp") == "1" + label: test_gcp_http_server_with_custom_ports on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_aws on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws + if: build.env("aws") == "1" + label: test_lambda_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_recovery_aws on aws + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + if: build.env("aws") == "1" + label: test_clone_disk_aws on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command --aws - if: build.env.aws == '1' + if: 
build.env("aws") == "1" label: test_aws_with_ssh_proxy_command on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws + if: build.env("aws") == "1" + label: test_launch_fast_with_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env.gcp == '1' - label: test_skyserve_spot_recovery on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws + if: build.env("aws") == "1" + label: test_cancel_pytorch on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - if: build.env.aws == '1' - label: test_skyserve_rolling_update on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + if: build.env("aws") == "1" + label: test_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_zone on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env("gcp") == "1" + label: test_skyserve_gcp_http on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - if: build.env.aws == '1' - label: test_aws_images on aws + if: build.env("aws") == "1" + 
label: test_fast_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env.azure == '1' - label: test_azure_storage_mounts_with_stop on azure -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env.kubernetes == '1' - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + if: build.env("aws") == "1" + label: test_file_mounts on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - if: build.env.gcp == '1' - label: test_gcp_region_and_service_account on gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env("azure") == "1" + label: test_azure_region on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - if: build.env.aws == '1' - label: test_inline_env on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env.azure == '1' - label: test_azure_disk_tier on azure + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_pipeline_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + command: 
pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env --aws - if: build.env.aws == '1' - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws + if: build.env("aws") == "1" + label: test_managed_jobs_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - if: build.env.gcp == '1' - label: test_stop_gcp_spot on gcp + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env("azure") == "1" + label: test_azure_images on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - if: build.env.aws == '1' - label: test_skyserve_large_readiness_timeout on aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_zone on gcp - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - if: build.env.aws == '1' - label: test_aws_stale_job_manual_restart on aws + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_failover on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env.aws == '1' - label: test_image_no_conda on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env("aws") == "1" + label: test_task_labels_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --aws - if: build.env.aws == '1' - label: test_managed_jobs_storage on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws + if: build.env("aws") == "1" + label: test_job_pipeline on 
aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws --aws - if: build.env.aws == '1' + if: build.env("aws") == "1" label: test_managed_jobs_recovery_multi_node_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - if: build.env.gcp == '1' - label: test_gcp_stale_job_manual_restart on gcp + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - if: build.env.aws == '1' - label: test_aws_http_server_with_custom_ports on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env("aws") == "1" + label: test_skyserve_llm on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - if: build.env.gcp == '1' - label: test_cancel_gcp on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_task_labels_kubernetes on kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + if: build.env("azure") == "1" + label: test_azure_best_tier_failover on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws - if: build.env.aws == '1' - label: test_core_api_sky_launch_fast on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws + if: build.env("aws") == "1" + label: test_inferentia on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + if: 
build.env("kubernetes") == "1" + label: test_tpu_pod_slice_gke on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup --aws - if: build.env.aws == '1' - label: test_skyserve_base_ondemand_fallback on aws + if: build.env("aws") == "1" + label: test_managed_jobs_failed_setup on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - if: build.env.aws == '1' - label: test_clone_disk_aws on aws + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + if: build.env("gcp") == "1" + label: test_clone_disk_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - if: build.env.aws == '1' - label: test_autostop on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws + if: build.env("aws") == "1" + label: test_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env("gcp") == "1" + label: test_cancel_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws - if: build.env.aws == '1' - label: test_managed_jobs_pipeline_failed_setup on aws + if: build.env("aws") == "1" + label: test_aws_http_server_with_custom_ports on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env.gcp == '1' - label: test_skyserve_dynamic_ondemand_fallback on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env("gcp") == "1" + label: test_task_labels_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env.aws == '1' - label: test_job_pipeline on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp + if: build.env("gcp") == "1" + label: test_gcp_images on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_public_bucket on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_tpu on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_private_bucket on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + if: build.env("aws") == "1" + label: test_multi_node_failure on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + if: build.env("aws") == "1" + label: test_docker_storage_mounts on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env.gcp == '1' - label: 
test_task_labels_gcp on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env("aws") == "1" + label: test_aws_custom_image on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - if: build.env.aws == '1' - label: test_multi_node_failure on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + if: build.env("gcp") == "1" + label: test_tpu_vm_pod on gcp - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts --kubernetes - if: build.env.kubernetes == '1' - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_storage_mounts on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - if: build.env.aws == '1' - label: test_docker_storage_mounts on aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + if: build.env("aws") == "1" + label: test_aws_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - if: build.env.aws == '1' - label: test_huggingface on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + if: build.env("aws") == "1" + label: test_use_spot on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws - if: build.env.aws == '1' - label: test_file_mounts on aws + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + if: build.env("aws") == "1" + label: test_example_app on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env.gcp == '1' - label: test_gcp_zero_quota_failover on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws + if: build.env("aws") == "1" + label: test_cancel_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws - if: build.env.aws == '1' - label: test_lambda_huggingface on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + if: build.env("aws") == "1" + label: test_skyserve_aws_http on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict on gcp + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_jobs_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket --aws - if: build.env.aws == '1' - label: test_aws_storage_mounts_with_stop on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_public_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - if: build.env.aws == '1' - label: test_launch_fast_with_autostop on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + if: build.env("aws") == "1" + label: test_distributed_tf on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict on aws + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env.gcp == '1' - label: test_gcp_image_id_dict_region on gcp + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp + if: build.env("gcp") == "1" + label: test_core_api_sky_launch_exec on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - if: build.env.azure == '1' - label: test_azure_best_tier_failover on azure + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env("aws") == "1" + label: test_multiple_resources on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_task_labels_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update --aws - if: build.env.aws == '1' - label: test_skyserve_readiness_timeout_fail on aws + if: build.env("aws") == "1" + label: test_skyserve_new_autoscaler_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - if: build.env.aws == '1' - label: test_multi_hostname on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure + if: build.env("azure") == "1" + label: test_azure_disk_tier on azure - agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - if: build.env.aws == '1' - label: test_multiple_accelerators_ordered_with_default on aws + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_switch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - if: build.env.azure == '1' - label: test_skyserve_azure_http on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws + if: build.env("aws") == "1" + label: test_skyserve_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws - if: build.env.aws == '1' - label: test_job_queue_multinode on aws + if: build.env("aws") == "1" + label: test_docker_preinstalled_package on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_gcs_regions on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws - if: build.env.aws == '1' - label: test_skyserve_fast_update on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + --aws + if: build.env("aws") == "1" + label: test_skyserve_large_readiness_timeout on aws - agents: queue: generic_cloud - command: 
pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - if: build.env.aws == '1' - label: test_skyserve_aws_http on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + --aws + if: build.env("aws") == "1" + label: test_job_queue_multinode on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - if: build.env.aws == '1' - label: test_skyserve_new_autoscaler_update on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env("aws") == "1" + label: test_skyserve_streaming on aws - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_storage_mounts on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + if: build.env("gcp") == "1" + label: test_tpu on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env.azure == '1' - label: test_azure_images on azure + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + --aws + if: build.env("aws") == "1" + label: test_multiple_accelerators_unordered on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - if: build.env.aws == '1' - label: test_use_spot on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_cancellation_gcp 
on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_tpu on gcp + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default + --aws + if: build.env("aws") == "1" + label: test_multiple_accelerators_ordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env.aws == '1' - label: test_skyserve_update on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict_region on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env.gcp == '1' - label: test_gcp_start_stop on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env("azure") == "1" + label: test_cancel_azure on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - if: build.env.aws == '1' - label: test_skyserve_user_bug_restart on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + if: build.env("gcp") == "1" + label: test_gcp_force_enable_external_ips on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - if: build.env.aws == '1' - label: test_env_check on aws + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + if: build.env("aws") == "1" + label: test_core_api_sky_launch_fast on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env.gcp == '1' - label: test_skyserve_auto_restart on gcp + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + if: build.env("azure") == "1" + label: test_azure_start_stop_two_nodes on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - if: build.env.gcp == '1' - label: test_gcp_storage_mounts_with_stop on gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + if: build.env("aws") == "1" + label: test_aws_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws - if: build.env.aws == '1' - label: test_managed_jobs_inline_env on aws + if: build.env("aws") == "1" + label: test_aws_zero_quota_failover on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars --aws - if: build.env.aws == '1' + if: build.env("aws") == "1" label: test_using_file_mounts_with_env_vars on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env.aws == '1' - label: test_aws_custom_image on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env.aws == '1' - label: test_job_queue on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default --aws - if: build.env.aws == '1' - label: test_long_setup_run_script on aws + if: 
build.env("aws") == "1" + label: test_multiple_accelerators_unordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env.aws == '1' - label: test_skyserve_failures on aws + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered + --aws + if: build.env("aws") == "1" + label: test_multiple_accelerators_ordered on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_custom_image on kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - if: build.env.kubernetes == '1' - label: test_skyserve_kubernetes_http on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + if: build.env("aws") == "1" + label: test_minimal on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env.gcp == '1' - label: test_tpu_vm on gcp + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env("gcp") == "1" + label: test_gcp_storage_mounts_with_stop on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - if: build.env.aws == '1' - label: test_inline_env_file on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws + if: build.env("aws") == "1" + label: test_job_queue on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + if: build.env("aws") == "1" + label: test_aws_images on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion --aws - if: build.env.aws == '1' - label: test_skyserve_load_balancer on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_bucket_external_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_pipeline_recovery_gcp on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_pipeline_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - if: build.env.aws == '1' - label: test_distributed_tf on aws + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws + if: build.env("aws") == "1" + label: test_image_no_conda on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws + if: build.env("aws") == "1" + label: test_user_dependencies on aws - agents: queue: generic_cloud - command: 
pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart --aws - if: build.env.aws == '1' - label: test_managed_jobs_failed_setup on aws + if: build.env("aws") == "1" + label: test_aws_stale_job_manual_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_multi_node_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - if: build.env.aws == '1' - label: test_multiple_resources on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_two_jobs_kubernetes on kubernetes + if: build.env("gcp") == "1" + label: test_gcp_zero_quota_failover on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env.aws == '1' - label: test_cancel_pytorch on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict_zone on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - if: build.env.aws == '1' - label: test_stale_job on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + if: build.env("azure") == "1" + label: test_azure_start_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env.gcp == '1' - label: test_gcp_force_enable_external_ips on gcp + command: 
pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - if: build.env.azure == '1' - label: test_cancel_azure on azure + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_aws_regions on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env("aws") == "1" + label: test_inline_env_file on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_nonexistent_bucket on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + if: build.env("aws") == "1" + label: test_managed_jobs on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names --aws - if: build.env.aws == '1' - label: test_skyserve_update_autoscale on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_invalid_names on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup --aws - if: build.env.aws == '1' - label: test_job_queue_with_docker on aws + if: build.env("aws") == "1" + 
label: test_managed_jobs_pipeline_failed_setup on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop --kubernetes - if: build.env.kubernetes == '1' - label: test_kubernetes_context_failover on kubernetes + if: build.env("kubernetes") == "1" + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - if: build.env.aws == '1' - label: test_cli_logs on aws + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + if: build.env("aws") == "1" + label: test_stale_job on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update --aws - if: build.env.aws == '1' - label: test_aws_zero_quota_failover on aws + if: build.env("aws") == "1" + label: test_skyserve_rolling_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - if: build.env.aws == '1' - label: test_aws_zone on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + if: build.env("aws") == "1" + label: test_autodown on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - if: build.env.aws == '1' - label: test_cancel_aws on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp + if: build.env("gcp") == "1" + label: test_tpu_vm on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources --aws - if: build.env.aws == '1' - label: 
test_aws_image_id_dict_region on aws + if: build.env("aws") == "1" + label: test_managed_jobs_recovery_default_resources on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - if: build.env.aws == '1' - label: test_lambda_job_queue on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws + if: build.env("aws") == "1" + label: test_lambda_huggingface on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws + if: build.env("aws") == "1" + label: test_job_queue_with_docker on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env.azure == '1' - label: test_azure_start_stop_two_nodes on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws + if: build.env("aws") == "1" + label: test_skyserve_failures on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - if: build.env.aws == '1' - label: test_task_labels_aws on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + --aws + if: build.env("aws") == "1" + label: test_skyserve_base_ondemand_fallback on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env.aws == '1' - label: test_docker_preinstalled_package on aws + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws + if: build.env("aws") == "1" + label: test_sky_bench on aws - agents: 
queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env.azure == '1' - label: test_azure_http_server_with_custom_ports on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + if: build.env("aws") == "1" + label: test_skyserve_cancel on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - if: build.env.gcp == '1' - label: test_tpu_vm_pod on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_dynamic_ondemand_fallback on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - if: build.env.aws == '1' - label: test_launch_fast on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_recovery_multi_node_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_bucket_external_deletion on aws + if: build.env("aws") == "1" + label: test_skyserve_update_autoscale on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + if: build.env("aws") == "1" + label: test_env_check on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + if: build.env("aws") == "1" + label: test_skyserve_fast_update on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion --aws - if: build.env.aws == '1' - label: test_custom_default_conda_env on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_invalid_names on aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - if: build.env.aws == '1' - label: test_aws_region on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + if: build.env("aws") == "1" + label: test_multi_hostname on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - if: build.env.aws == '1' - label: test_example_app on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + if: build.env("aws") == "1" + label: test_multi_echo on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_http_server_with_custom_ports on kubernetes +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + --kubernetes + if: build.env("kubernetes") == "1" + label: test_skyserve_kubernetes_http on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - if: build.env.aws == '1' - label: 
test_skyserve_cancel on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + if: build.env("aws") == "1" + label: test_huggingface on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer --aws - if: build.env.aws == '1' - label: test_managed_jobs_cancellation_aws on aws + if: build.env("aws") == "1" + label: test_skyserve_load_balancer on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env.aws == '1' - label: test_inferentia on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp + if: build.env("gcp") == "1" + label: test_stop_gcp_spot on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env.gcp == '1' - label: test_skyserve_gcp_http on gcp + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - if: build.env.kubernetes == '1' - label: test_tpu_pod_slice_gke on kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_multinode_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env.gcp == '1' - label: test_gcp_images on gcp + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent + --aws + if: build.env("aws") == "1" + 
label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch --kubernetes - if: build.env.kubernetes == '1' - label: test_container_logs_multinode_kubernetes on kubernetes + if: build.env("kubernetes") == "1" + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - if: build.env.gcp == '1' - label: test_tpu on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports + --azure + if: build.env("azure") == "1" + label: test_azure_http_server_with_custom_ports on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_cancellation_gcp on gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - if: build.env.aws == '1' - label: test_skyserve_llm on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + --aws + if: build.env("aws") == "1" + label: test_aws_storage_mounts_with_stop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env.aws == '1' - label: test_sky_bench on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp + if: build.env("gcp") == "1" + label: test_gcp_start_stop on gcp - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - if: build.env.aws == '1' - label: test_large_job_queue on aws + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws + if: build.env("aws") == "1" + label: test_custom_default_conda_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - if: build.env.gcp == '1' - label: test_gcp_disk_tier on gcp + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart + --gcp + if: build.env("gcp") == "1" + label: test_gcp_stale_job_manual_restart on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env.aws == '1' - label: test_user_dependencies on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env("gcp") == "1" + label: test_gcp_mig on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - if: build.env.aws == '1' - label: test_managed_jobs on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_auto_restart on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - if: build.env.aws == '1' - label: TestStorageWithCredentials::test_list_source on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + if: build.env("aws") == "1" + label: test_aws_disk_tier on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - if: build.env.gcp == '1' - label: test_core_api_sky_launch_exec on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + --aws + if: build.env("aws") == "1" + label: 
test_managed_jobs_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - if: build.env.aws == '1' - label: test_aws_image_id_dict_zone on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env.gcp == '1' - label: test_gcp_zone on gcp + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account --gcp - if: build.env.gcp == '1' - label: test_managed_jobs_recovery_gcp on gcp + if: build.env("gcp") == "1" + label: test_gcp_region_and_service_account on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - if: build.env.azure == '1' - label: test_azure_start_stop on azure + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + --aws + if: build.env("aws") == "1" + label: test_skyserve_user_bug_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail --aws - if: build.env.aws == '1' - label: test_managed_jobs_recovery_default_resources on aws + if: build.env("aws") == "1" + label: test_skyserve_readiness_timeout_fail on aws From 115af3065dc8eccf87b032dab1e7714f24963e93 Mon 
Sep 17 00:00:00 2001 From: ZePing Guo Date: Fri, 29 Nov 2024 23:38:51 +0800 Subject: [PATCH 45/64] exclude lambda cloud --- .buildkite/generate_pipeline.py | 3 +- .../pipeline_smoke_tests_pre_merge.yaml | 12 +- .buildkite/pipeline_smoke_tests_release.yaml | 862 +++++++++--------- 3 files changed, 434 insertions(+), 443 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 3c3b9c41edf..baf72d09726 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -12,7 +12,8 @@ ALL_CLOUDS_IN_SMOKE_TESTS = [ 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', - 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod' + 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod', + 'lambda_cloud' ] QUEUE_GENERIC_CLOUD = 'generic_cloud' QUEUE_KUBERNETES = 'kubernetes' diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml index 35ba7ea17ec..d3e992a3189 100644 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ b/.buildkite/pipeline_smoke_tests_pre_merge.yaml @@ -3,15 +3,15 @@ env: LOG_TO_STDOUT: '1' PYTHONPATH: ${PYTHONPATH}:$(pwd) steps: -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env("aws") == "1" - label: test_yaml_launch_and_mount on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount --azure if: build.env("azure") == "1" label: test_yaml_launch_and_mount on azure +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount + --aws + if: build.env("aws") == "1" + label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml index fb22f52afec..c173e2ae0ca 100644 --- 
a/.buildkite/pipeline_smoke_tests_release.yaml +++ b/.buildkite/pipeline_smoke_tests_release.yaml @@ -9,236 +9,199 @@ env: steps: - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_spot_recovery on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env("azure") == "1" - label: test_azure_storage_mounts_with_stop on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_list_source on aws + label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions --aws if: build.env("aws") == "1" - label: test_managed_jobs_cancellation_aws on aws + label: TestStorageWithCredentials::test_gcs_regions on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + --aws if: build.env("aws") == "1" - label: test_autostop on aws + label: test_job_queue_with_docker on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + --aws if: build.env("aws") == "1" - label: test_cli_logs on aws + label: test_skyserve_rolling_update on aws - agents: queue: generic_cloud - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account + --gcp if: build.env("gcp") == "1" - label: test_gcp_disk_tier on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_custom_image on kubernetes + label: test_gcp_region_and_service_account on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws + --aws if: build.env("aws") == "1" - label: test_launch_fast on aws + label: test_managed_jobs_recovery_multi_node_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - if: build.env("azure") == "1" - label: test_skyserve_azure_http on azure + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover + --gcp + if: build.env("gcp") == "1" + label: test_gcp_zero_quota_failover on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws --aws if: build.env("aws") == "1" - label: test_managed_jobs_storage on aws + label: test_managed_jobs_pipeline_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_aws_regions on aws + label: test_docker_preinstalled_package on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_nonexistent_bucket on aws + label: test_clone_disk_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail --aws if: build.env("aws") == "1" - label: test_long_setup_run_script on aws + label: test_skyserve_readiness_timeout_fail on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_private_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env("gcp") == "1" - label: test_gcp_http_server_with_custom_ports on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_job_queue --aws - if: build.env("aws") == "1" - label: test_lambda_job_queue on aws + label: test_using_file_mounts_with_env_vars on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws if: build.env("aws") == "1" - label: test_clone_disk_aws on aws + label: test_huggingface on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup --aws if: build.env("aws") == "1" - label: 
test_aws_with_ssh_proxy_command on aws + label: test_managed_jobs_failed_setup on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts --aws if: build.env("aws") == "1" - label: test_launch_fast_with_autostop on aws + label: test_docker_storage_mounts on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env("aws") == "1" - label: test_cancel_pytorch on aws + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + --aws if: build.env("aws") == "1" - label: test_inline_env on aws + label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env("gcp") == "1" - label: test_skyserve_gcp_http on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback --aws if: build.env("aws") == "1" - label: test_fast_large_job_queue on aws + label: test_skyserve_base_ondemand_fallback on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws if: build.env("aws") == "1" - label: test_file_mounts on aws + label: test_skyserve_failures on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env("azure") == "1" - label: test_azure_region on azure + queue: kubernetes + command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_failover on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws + command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws + label: test_aws_images on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_pipeline_recovery_gcp on gcp + label: test_skyserve_spot_recovery on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws + command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws if: build.env("aws") == "1" - label: test_managed_jobs_inline_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env("azure") == "1" - label: test_azure_images on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env("gcp") == "1" - label: 
test_gcp_zone on gcp + label: test_stale_job on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image --kubernetes if: build.env("kubernetes") == "1" - label: test_kubernetes_context_failover on kubernetes + label: test_kubernetes_custom_image on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + --aws if: build.env("aws") == "1" - label: test_task_labels_aws on aws + label: test_custom_default_conda_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env("aws") == "1" - label: test_job_pipeline on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict_zone on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips + --gcp + if: build.env("gcp") == "1" + label: test_gcp_force_enable_external_ips on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws if: build.env("aws") == "1" - label: test_managed_jobs_recovery_multi_node_aws on aws + label: test_multi_node_failure on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws if: build.env("aws") == "1" - label: test_aws_image_id_dict on aws + label: test_core_api_sky_launch_fast on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + --aws if: build.env("aws") == "1" - label: test_skyserve_llm on aws + label: test_skyserve_load_balancer on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports --azure if: build.env("azure") == "1" - label: test_azure_best_tier_failover on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env("aws") == "1" - label: test_inferentia on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - if: build.env("kubernetes") == "1" - label: test_tpu_pod_slice_gke on kubernetes + label: test_azure_http_server_with_custom_ports on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws + command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws if: build.env("aws") == "1" - label: test_managed_jobs_failed_setup on aws + label: test_cli_logs on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region + --gcp if: build.env("gcp") == "1" - label: test_clone_disk_gcp on gcp + label: test_gcp_image_id_dict_region on gcp - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws @@ -246,122 +209,127 @@ steps: label: test_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp if: build.env("gcp") == 
"1" - label: test_cancel_gcp on gcp + label: test_tpu on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop --aws if: build.env("aws") == "1" - label: test_aws_http_server_with_custom_ports on aws + label: test_aws_storage_mounts_with_stop on aws - agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env("gcp") == "1" - label: test_task_labels_gcp on gcp + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_storage_mounts on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env("gcp") == "1" - label: test_gcp_images on gcp + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop + --aws + if: build.env("aws") == "1" + label: test_launch_fast_with_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp + --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_tpu on gcp + label: test_managed_jobs_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws if: build.env("aws") == "1" - label: test_multi_node_failure on aws + label: test_skyserve_cancel on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone 
--aws if: build.env("aws") == "1" - label: test_docker_storage_mounts on aws + label: test_aws_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env("aws") == "1" - label: test_aws_custom_image on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart + --gcp if: build.env("gcp") == "1" - label: test_tpu_vm_pod on gcp + label: test_skyserve_auto_restart on gcp - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch --kubernetes if: build.env("kubernetes") == "1" - label: test_kubernetes_storage_mounts on kubernetes + label: test_add_pod_annotations_for_autodown_with_launch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue + --aws if: build.env("aws") == "1" - label: test_aws_region on aws + label: test_fast_large_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws if: build.env("aws") == "1" - label: test_use_spot on aws + label: test_managed_jobs on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + --aws if: build.env("aws") == "1" - label: test_example_app on aws + label: test_aws_image_id_dict_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws 
--aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources + --aws if: build.env("aws") == "1" - label: test_cancel_aws on aws + label: test_managed_jobs_recovery_default_resources on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp + if: build.env("gcp") == "1" + label: test_gcp_zone on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws if: build.env("aws") == "1" - label: test_skyserve_aws_http on aws + label: test_launch_fast on aws - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_jobs_kubernetes on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_cancellation_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_public_bucket on aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_pipeline_recovery_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket + --aws if: build.env("aws") == "1" - label: test_distributed_tf on aws + label: TestStorageWithCredentials::test_private_bucket on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws + label: test_job_queue on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - if: build.env("gcp") == "1" - label: test_core_api_sky_launch_exec on gcp + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws + if: build.env("aws") == "1" + label: test_aws_image_id_dict on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws if: build.env("aws") == "1" - label: test_multiple_resources on aws + label: test_skyserve_update on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http --kubernetes if: build.env("kubernetes") == "1" - label: test_task_labels_kubernetes on kubernetes + label: test_skyserve_kubernetes_http on kubernetes - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update @@ -370,394 +338,366 @@ steps: label: test_skyserve_new_autoscaler_update on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env("azure") == "1" - label: test_azure_disk_tier on azure + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions + --aws + if: build.env("aws") == "1" + label: TestStorageWithCredentials::test_aws_regions on aws - agents: queue: kubernetes - command: 
pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes --kubernetes if: build.env("kubernetes") == "1" - label: test_kubernetes_context_switch on kubernetes + label: test_task_labels_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env("aws") == "1" - label: test_skyserve_update on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp + if: build.env("gcp") == "1" + label: test_gcp_disk_tier on gcp +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_http_server_with_custom_ports on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env("aws") == "1" - label: test_docker_preinstalled_package on aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback + --gcp + if: build.env("gcp") == "1" + label: test_skyserve_dynamic_ondemand_fallback on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_gcs_regions on aws + label: test_inline_env on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_tpu on gcp +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion --aws if: build.env("aws") == "1" - label: test_skyserve_large_readiness_timeout on aws + label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on + aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode + command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent --aws if: build.env("aws") == "1" - label: test_job_queue_multinode on aws + label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env("aws") == "1" - label: test_skyserve_streaming on aws + command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure + if: build.env("azure") == "1" + label: test_azure_images on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp if: build.env("gcp") == "1" - label: test_tpu on gcp + label: test_tpu_vm on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_unordered on aws + label: test_managed_jobs_recovery_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp + command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_cancellation_gcp on gcp + label: test_gcp_stale_job_manual_restart on gcp - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered_with_default on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_region on gcp + label: test_use_spot on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure if: build.env("azure") == "1" - label: test_cancel_azure on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env("gcp") == "1" - label: test_gcp_force_enable_external_ips on gcp + label: test_azure_start_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws if: build.env("aws") == "1" - label: test_core_api_sky_launch_fast on aws + label: test_autodown on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env("azure") == "1" - label: test_azure_start_stop_two_nodes on azure + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp + if: build.env("gcp") == "1" + label: test_cancel_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws if: build.env("aws") == "1" - label: test_aws_zone on aws + label: test_user_dependencies on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws if: build.env("aws") == "1" - label: test_aws_zero_quota_failover on aws + label: test_autostop on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars + command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode --aws if: build.env("aws") == "1" - label: test_using_file_mounts_with_env_vars on aws + label: test_job_queue_multinode on aws +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch + --kubernetes + if: build.env("kubernetes") == "1" + label: test_kubernetes_context_switch on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure + if: build.env("azure") == "1" + label: test_cancel_azure on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_unordered_with_default on aws + label: test_managed_jobs_cancellation_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws + command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered on aws -- agents: - queue: kubernetes - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes + label: test_sky_bench on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws if: build.env("aws") == "1" - label: test_minimal on aws + label: test_skyserve_aws_http on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp if: build.env("gcp") == "1" - label: test_gcp_storage_mounts_with_stop on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env("aws") == "1" - label: test_job_queue on aws + label: test_gcp_start_stop on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws if: build.env("aws") == "1" - label: test_aws_images on aws + label: test_multi_hostname on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws + label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion + 
command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_external_deletion on aws + label: test_multiple_accelerators_ordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage --aws if: build.env("aws") == "1" - label: test_managed_jobs_pipeline_recovery_aws on aws + label: test_managed_jobs_storage on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env("aws") == "1" - label: test_image_no_conda on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env("aws") == "1" - label: test_user_dependencies on aws + command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp + if: build.env("gcp") == "1" + label: test_gcp_mig on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered --aws if: build.env("aws") == "1" - label: test_aws_stale_job_manual_restart on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env("gcp") == "1" - label: test_gcp_zero_quota_failover on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_zone on gcp + label: test_multiple_accelerators_ordered on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure if: build.env("azure") == "1" - label: test_azure_start_stop on azure + label: test_azure_disk_tier on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws + command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws if: build.env("aws") == "1" - label: test_aws_image_id_dict_zone on aws + label: test_env_check on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale --aws if: build.env("aws") == "1" - label: test_aws_image_id_dict_region on aws + label: test_skyserve_update_autoscale on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws if: build.env("aws") == "1" - label: test_inline_env_file on aws + label: test_cancel_aws on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws if: build.env("aws") == "1" - label: test_managed_jobs on aws + label: test_inferentia on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_invalid_names on aws + label: test_aws_stale_job_manual_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws if: build.env("aws") == "1" - label: 
test_managed_jobs_pipeline_failed_setup on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes + label: test_aws_region on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws + command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws if: build.env("aws") == "1" - label: test_stale_job on aws + label: test_job_pipeline on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update + command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + if: build.env("gcp") == "1" + label: test_gcp_image_id_dict on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage --aws if: build.env("aws") == "1" - label: test_skyserve_rolling_update on aws + label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws if: build.env("aws") == "1" - label: test_autodown on aws + label: test_distributed_tf on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env("gcp") == "1" - label: test_tpu_vm on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes + --azure + if: build.env("azure") == "1" + label: test_azure_start_stop_two_nodes on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --aws + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws if: build.env("aws") == "1" - label: test_managed_jobs_recovery_default_resources on aws + label: test_multi_echo on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_lambda_huggingface --aws - if: build.env("aws") == "1" - label: test_lambda_huggingface on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop + --azure + if: build.env("azure") == "1" + label: test_azure_storage_mounts_with_stop on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source --aws if: build.env("aws") == "1" - label: test_job_queue_with_docker on aws + label: TestStorageWithCredentials::test_list_source on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env("aws") == "1" - label: test_skyserve_failures on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports + --gcp + if: build.env("gcp") == "1" + label: test_gcp_http_server_with_custom_ports on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp + if: build.env("gcp") == "1" + label: test_skyserve_gcp_http on gcp +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket --aws if: build.env("aws") == "1" - label: test_skyserve_base_ondemand_fallback on aws + label: TestStorageWithCredentials::test_public_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env("aws") == 
"1" - label: test_sky_bench on aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp + if: build.env("gcp") == "1" + label: test_task_labels_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws + command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws if: build.env("aws") == "1" - label: test_skyserve_cancel on aws + label: test_aws_image_id_dict_zone on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_dynamic_ondemand_fallback on gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover + --azure + if: build.env("azure") == "1" + label: test_azure_best_tier_failover on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_multi_node_gcp on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws + if: build.env("aws") == "1" + label: test_skyserve_llm on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws + command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws if: build.env("aws") == "1" - label: test_skyserve_update_autoscale on aws + label: test_example_app on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names + --aws if: build.env("aws") == "1" - label: test_env_check on aws + label: TestStorageWithCredentials::test_invalid_names on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + --aws if: build.env("aws") == "1" - label: test_skyserve_fast_update on aws + label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws + label: test_aws_zero_quota_failover on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws + label: test_skyserve_user_bug_restart on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws + command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws if: build.env("aws") == "1" - label: test_multi_hostname on aws + label: test_cancel_pytorch on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws + command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws if: build.env("aws") == "1" - label: test_multi_echo on aws + label: test_minimal on aws - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_http_server_with_custom_ports on kubernetes + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws + if: build.env("aws") == "1" + label: test_inline_env_file on aws - agents: queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http + command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop --kubernetes if: build.env("kubernetes") == "1" - label: test_skyserve_kubernetes_http on kubernetes + label: test_add_and_remove_pod_annotations_with_autostop on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws + command: pytest tests/smoke_tests/test_images.py::test_image_no_conda --aws if: build.env("aws") == "1" - label: test_huggingface on aws + label: test_image_no_conda on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion --aws if: build.env("aws") == "1" - label: test_skyserve_load_balancer on aws + label: TestStorageWithCredentials::test_bucket_external_deletion on aws - agents: queue: generic_cloud command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp if: build.env("gcp") == "1" label: test_stop_gcp_spot on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws - agents: queue: kubernetes command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes @@ -766,100 +706,150 @@ steps: label: test_container_logs_multinode_kubernetes on kubernetes - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - 
--aws - if: build.env("aws") == "1" - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop + --gcp + if: build.env("gcp") == "1" + label: test_gcp_storage_mounts_with_stop on gcp - agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws + if: build.env("aws") == "1" + label: test_multiple_resources on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env("azure") == "1" - label: test_azure_http_server_with_custom_ports on azure + command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws + if: build.env("aws") == "1" + label: test_file_mounts on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_gcp on gcp + label: test_tpu_vm_pod on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy --aws if: build.env("aws") == "1" - label: test_aws_storage_mounts_with_stop on aws + label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env("gcp") == 
"1" - label: test_gcp_start_stop on gcp + command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup + --aws + if: build.env("aws") == "1" + label: test_managed_jobs_pipeline_failed_setup on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports --aws if: build.env("aws") == "1" - label: test_custom_default_conda_env on aws + label: test_aws_http_server_with_custom_ports on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp + command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp if: build.env("gcp") == "1" - label: test_gcp_stale_job_manual_restart on gcp + label: test_gcp_images on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env("gcp") == "1" - label: test_gcp_mig on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws + if: build.env("aws") == "1" + label: test_skyserve_streaming on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_auto_restart on gcp + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure + if: build.env("azure") == "1" + label: test_skyserve_azure_http on azure - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces + --aws if: build.env("aws") == "1" - label: test_aws_disk_tier on aws + label: TestStorageWithCredentials::test_upload_source_with_spaces on aws - agents: queue: generic_cloud - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws + command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket --aws if: build.env("aws") == "1" - label: test_managed_jobs_recovery_aws on aws + label: TestStorageWithCredentials::test_nonexistent_bucket on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws + if: build.env("aws") == "1" + label: test_skyserve_fast_update on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws + if: build.env("aws") == "1" + label: test_task_labels_aws on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered --aws if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws + label: test_multiple_accelerators_unordered on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure + if: build.env("azure") == "1" + label: test_azure_region on azure +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes + --kubernetes + if: build.env("kubernetes") == "1" + label: test_container_logs_two_jobs_kubernetes on kubernetes +- agents: + queue: kubernetes + command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes + if: build.env("kubernetes") == "1" + label: test_tpu_pod_slice_gke on kubernetes +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp if: 
build.env("gcp") == "1" - label: test_gcp_image_id_dict on gcp + label: test_core_api_sky_launch_exec on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp + command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command + --aws + if: build.env("aws") == "1" + label: test_aws_with_ssh_proxy_command on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws + if: build.env("aws") == "1" + label: test_aws_custom_image on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script + --aws + if: build.env("aws") == "1" + label: test_long_setup_run_script on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp if: build.env("gcp") == "1" - label: test_gcp_region_and_service_account on gcp + label: test_clone_disk_gcp on gcp - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart + command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default --aws if: build.env("aws") == "1" - label: test_skyserve_user_bug_restart on aws + label: test_multiple_accelerators_unordered_with_default on aws - agents: queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail + command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws + if: build.env("aws") == "1" + label: test_aws_disk_tier on aws +- agents: + queue: generic_cloud + command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout --aws if: build.env("aws") == "1" - label: test_skyserve_readiness_timeout_fail on aws + label: test_skyserve_large_readiness_timeout on aws +- agents: + queue: generic_cloud + command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp + --gcp + if: build.env("gcp") == "1" + label: test_managed_jobs_recovery_multi_node_gcp on gcp From 0c7bfd503f98206895b9a4357a678801154be1d8 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 30 Nov 2024 12:48:55 +0800 Subject: [PATCH 46/64] dynamic generate pipeline --- .buildkite/generate_pipeline.py | 28 +- .../pipeline_smoke_tests_pre_merge.yaml | 17 - .buildkite/pipeline_smoke_tests_release.yaml | 855 ------------------ .pre-commit-config.yaml | 2 +- 4 files changed, 25 insertions(+), 877 deletions(-) delete mode 100644 .buildkite/pipeline_smoke_tests_pre_merge.yaml delete mode 100644 .buildkite/pipeline_smoke_tests_release.yaml diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index baf72d09726..bd01bcf6bb4 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -1,4 +1,24 @@ -"""This script generates a Buildkite pipeline from test files.""" +""" +This script generates a Buildkite pipeline from test files. + +The script will generate two pipelines: + +tests/smoke_tests +├── test_*.py -> release pipeline +├── test_required_before_merge.py -> pre-merge pipeline + +1. release pipeline, which runs all smoke tests by default, some function + support tests by multiple clouds, but we only generate one cloud per test + function to save cost. +2. pre-merge pipeline, which generates all clouds supported by the test + function, author should specify which clouds to run by setting env in the + step. + +We only have credentials for aws/azure/gcp/kubernetes(CLOUD_QUEUE_MAP) now, +smoke tests for those clouds are generated, other clouds are not supported +yet, smoke tests for those clouds are not generated. 
+""" + import ast from collections import defaultdict import copy @@ -32,7 +52,7 @@ 'edit directly.\n' ) - + def _get_full_decorator_path(decorator: ast.AST) -> str: """Recursively get the full path of a decorator.""" if isinstance(decorator, ast.Attribute): @@ -169,8 +189,8 @@ def _convert_pre_merge(test_files: List[str]): output_file_pipelines_map = defaultdict(list) for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') - # We want enable all clouds by default for each test function - # for pre-merge. And let the author controls which clouds + # We want enable all clouds by default for each test function + # for pre-merge. And let the author controls which clouds # to run by parameter. pipeline = _generate_pipeline(test_file, False) output_file_pipelines_map[yaml_file_path].append(pipeline) diff --git a/.buildkite/pipeline_smoke_tests_pre_merge.yaml b/.buildkite/pipeline_smoke_tests_pre_merge.yaml deleted file mode 100644 index d3e992a3189..00000000000 --- a/.buildkite/pipeline_smoke_tests_pre_merge.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. 
-env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) -steps: -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --azure - if: build.env("azure") == "1" - label: test_yaml_launch_and_mount on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount - --aws - if: build.env("aws") == "1" - label: test_yaml_launch_and_mount on aws diff --git a/.buildkite/pipeline_smoke_tests_release.yaml b/.buildkite/pipeline_smoke_tests_release.yaml deleted file mode 100644 index c173e2ae0ca..00000000000 --- a/.buildkite/pipeline_smoke_tests_release.yaml +++ /dev/null @@ -1,855 +0,0 @@ -# This is an auto-generated Buildkite pipeline by .buildkite/generate_pipeline.py, Please do not edit directly. -env: - LOG_TO_STDOUT: '1' - PYTHONPATH: ${PYTHONPATH}:$(pwd) - aws: '1' - azure: '1' - gcp: '1' - kubernetes: '1' -steps: -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_bulk_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_bulk_deletion on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_gcs_regions - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_gcs_regions on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_with_docker - --aws - if: build.env("aws") == "1" - label: test_job_queue_with_docker on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_rolling_update - --aws - if: build.env("aws") == "1" - label: test_skyserve_rolling_update on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_region_and_zone.py::test_gcp_region_and_service_account - --gcp - if: build.env("gcp") == "1" - label: test_gcp_region_and_service_account on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_recovery_multi_node_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_zero_quota_failover - --gcp - if: build.env("gcp") == "1" - label: test_gcp_zero_quota_failover on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_pipeline_recovery_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_docker_preinstalled_package - --aws - if: build.env("aws") == "1" - label: test_docker_preinstalled_package on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_aws --aws - if: build.env("aws") == "1" - label: test_clone_disk_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_readiness_timeout_fail - --aws - if: build.env("aws") == "1" - label: test_skyserve_readiness_timeout_fail on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_using_file_mounts_with_env_vars - --aws - if: build.env("aws") == "1" - label: test_using_file_mounts_with_env_vars on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_huggingface --aws - if: build.env("aws") == "1" - label: test_huggingface on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_failed_setup - --aws - if: build.env("aws") == "1" - 
label: test_managed_jobs_failed_setup on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_docker_storage_mounts - --aws - if: build.env("aws") == "1" - label: test_docker_storage_mounts on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_simultaneous_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_simultaneous_jobs_kubernetes on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_new_bucket_creation_and_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_new_bucket_creation_and_deletion on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_inline_env - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_inline_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_base_ondemand_fallback - --aws - if: build.env("aws") == "1" - label: test_skyserve_base_ondemand_fallback on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_failures --aws - if: build.env("aws") == "1" - label: test_skyserve_failures on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_basic.py::test_kubernetes_context_failover - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_context_failover on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_images --aws - if: build.env("aws") == "1" - label: test_aws_images on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_spot_recovery - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_spot_recovery on 
gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_stale_job --aws - if: build.env("aws") == "1" - label: test_stale_job on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_custom_image - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_custom_image on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_custom_default_conda_env - --aws - if: build.env("aws") == "1" - label: test_custom_default_conda_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_zone --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_zone on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_force_enable_external_ips - --gcp - if: build.env("gcp") == "1" - label: test_gcp_force_enable_external_ips on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_node_failure --aws - if: build.env("aws") == "1" - label: test_multi_node_failure on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_fast --aws - if: build.env("aws") == "1" - label: test_core_api_sky_launch_fast on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_load_balancer - --aws - if: build.env("aws") == "1" - label: test_skyserve_load_balancer on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_http_server_with_custom_ports - --azure - if: build.env("azure") == "1" - label: test_azure_http_server_with_custom_ports on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_cli_logs --aws - if: build.env("aws") == "1" - label: test_cli_logs on aws -- agents: - 
queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict_region - --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict_region on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_large_job_queue --aws - if: build.env("aws") == "1" - label: test_large_job_queue on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu --gcp - if: build.env("gcp") == "1" - label: test_tpu on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_aws_storage_mounts_with_stop - --aws - if: build.env("aws") == "1" - label: test_aws_storage_mounts_with_stop on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_storage_mounts - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_storage_mounts on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast_with_autostop - --aws - if: build.env("aws") == "1" - label: test_launch_fast_with_autostop on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_cancel --aws - if: build.env("aws") == "1" - label: test_skyserve_cancel on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_zone --aws - if: build.env("aws") == "1" - label: test_aws_zone on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_auto_restart - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_auto_restart on gcp -- agents: - queue: kubernetes - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_add_pod_annotations_for_autodown_with_launch - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_pod_annotations_for_autodown_with_launch on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_fast_large_job_queue - --aws - if: build.env("aws") == "1" - label: test_fast_large_job_queue on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs --aws - if: build.env("aws") == "1" - label: test_managed_jobs on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_region - --aws - if: build.env("aws") == "1" - label: test_aws_image_id_dict_region on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_default_resources - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_recovery_default_resources on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_gcp_zone --gcp - if: build.env("gcp") == "1" - label: test_gcp_zone on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_launch_fast --aws - if: build.env("aws") == "1" - label: test_launch_fast on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_cancellation_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_recovery_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_pipeline_recovery_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_private_bucket - --aws - if: build.env("aws") == "1" - 
label: TestStorageWithCredentials::test_private_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue --aws - if: build.env("aws") == "1" - label: test_job_queue on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict --aws - if: build.env("aws") == "1" - label: test_aws_image_id_dict on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update --aws - if: build.env("aws") == "1" - label: test_skyserve_update on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_kubernetes_http - --kubernetes - if: build.env("kubernetes") == "1" - label: test_skyserve_kubernetes_http on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update - --aws - if: build.env("aws") == "1" - label: test_skyserve_new_autoscaler_update on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_aws_regions - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_aws_regions on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_task_labels_kubernetes on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_disk_tier --gcp - if: build.env("gcp") == "1" - label: test_gcp_disk_tier on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_kubernetes_http_server_with_custom_ports - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_http_server_with_custom_ports on kubernetes -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_dynamic_ondemand_fallback - --gcp - if: build.env("gcp") == "1" - label: test_skyserve_dynamic_ondemand_fallback on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env --aws - if: build.env("aws") == "1" - label: test_inline_env on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_tpu --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_tpu on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_multiple_buckets_creation_and_deletion on - aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::TestYamlSpecs::test_load_dump_yaml_config_equivalent - --aws - if: build.env("aws") == "1" - label: TestYamlSpecs::test_load_dump_yaml_config_equivalent on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_azure_images --azure - if: build.env("azure") == "1" - label: test_azure_images on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm --gcp - if: build.env("gcp") == "1" - label: test_tpu_vm on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_recovery_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_gcp_stale_job_manual_restart - --gcp - if: build.env("gcp") == "1" - label: test_gcp_stale_job_manual_restart on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_use_spot --aws - if: build.env("aws") == "1" - label: test_use_spot 
on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop --azure - if: build.env("azure") == "1" - label: test_azure_start_stop on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autodown --aws - if: build.env("aws") == "1" - label: test_autodown on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_gcp --gcp - if: build.env("gcp") == "1" - label: test_cancel_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_user_dependencies --aws - if: build.env("aws") == "1" - label: test_user_dependencies on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_autostop --aws - if: build.env("aws") == "1" - label: test_autostop on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_job_queue_multinode - --aws - if: build.env("aws") == "1" - label: test_job_queue_multinode on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_kubernetes_context_switch - --kubernetes - if: build.env("kubernetes") == "1" - label: test_kubernetes_context_switch on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_azure --azure - if: build.env("azure") == "1" - label: test_cancel_azure on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_cancellation_aws - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_cancellation_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_sky_bench --aws - if: build.env("aws") == "1" - label: test_sky_bench on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_sky_serve.py::test_skyserve_aws_http --aws - if: build.env("aws") == "1" - label: test_skyserve_aws_http on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_start_stop --gcp - if: build.env("gcp") == "1" - label: test_gcp_start_stop on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_multi_hostname --aws - if: build.env("aws") == "1" - label: test_multi_hostname on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_externally_created_bucket_mount_without_source - on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered_with_default - --aws - if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered_with_default on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_storage - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_storage on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_mig --gcp - if: build.env("gcp") == "1" - label: test_gcp_mig on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_ordered - --aws - if: build.env("aws") == "1" - label: test_multiple_accelerators_ordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_disk_tier --azure - if: build.env("azure") == "1" - label: test_azure_disk_tier on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_env_check --aws - if: build.env("aws") == "1" - label: test_env_check on aws -- agents: - queue: 
generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_update_autoscale - --aws - if: build.env("aws") == "1" - label: test_skyserve_update_autoscale on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_aws --aws - if: build.env("aws") == "1" - label: test_cancel_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inferentia --aws - if: build.env("aws") == "1" - label: test_inferentia on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_aws_stale_job_manual_restart - --aws - if: build.env("aws") == "1" - label: test_aws_stale_job_manual_restart on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_region --aws - if: build.env("aws") == "1" - label: test_aws_region on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_job_pipeline --aws - if: build.env("aws") == "1" - label: test_job_pipeline on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_image_id_dict --gcp - if: build.env("gcp") == "1" - label: test_gcp_image_id_dict on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_copy_mount_existing_storage - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_copy_mount_existing_storage on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_distributed_tf --aws - if: build.env("aws") == "1" - label: test_distributed_tf on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_start_stop_two_nodes - --azure - if: build.env("azure") == "1" - label: test_azure_start_stop_two_nodes on azure -- agents: - queue: generic_cloud - command: 
pytest tests/smoke_tests/test_cluster_job.py::test_multi_echo --aws - if: build.env("aws") == "1" - label: test_multi_echo on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_azure_storage_mounts_with_stop - --azure - if: build.env("azure") == "1" - label: test_azure_storage_mounts_with_stop on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_list_source - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_list_source on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_gcp_http_server_with_custom_ports - --gcp - if: build.env("gcp") == "1" - label: test_gcp_http_server_with_custom_ports on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_gcp_http --gcp - if: build.env("gcp") == "1" - label: test_skyserve_gcp_http on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_public_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_public_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_gcp --gcp - if: build.env("gcp") == "1" - label: test_task_labels_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_aws_image_id_dict_zone --aws - if: build.env("aws") == "1" - label: test_aws_image_id_dict_zone on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_azure_best_tier_failover - --azure - if: build.env("azure") == "1" - label: test_azure_best_tier_failover on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_llm --aws - if: build.env("aws") == "1" - label: 
test_skyserve_llm on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_example_app --aws - if: build.env("aws") == "1" - label: test_example_app on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_invalid_names - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_invalid_names on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_to_existing_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_to_existing_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_zero_quota_failover - --aws - if: build.env("aws") == "1" - label: test_aws_zero_quota_failover on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_user_bug_restart - --aws - if: build.env("aws") == "1" - label: test_skyserve_user_bug_restart on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_cancel_pytorch --aws - if: build.env("aws") == "1" - label: test_cancel_pytorch on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_minimal --aws - if: build.env("aws") == "1" - label: test_minimal on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_inline_env_file --aws - if: build.env("aws") == "1" - label: test_inline_env_file on aws -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_add_and_remove_pod_annotations_with_autostop - --kubernetes - if: build.env("kubernetes") == "1" - label: test_add_and_remove_pod_annotations_with_autostop on kubernetes -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_images.py::test_image_no_conda --aws - if: build.env("aws") == "1" - label: test_image_no_conda on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_bucket_external_deletion - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_bucket_external_deletion on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_stop_gcp_spot --gcp - if: build.env("gcp") == "1" - label: test_stop_gcp_spot on gcp -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_multinode_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_multinode_kubernetes on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_gcp_storage_mounts_with_stop - --gcp - if: build.env("gcp") == "1" - label: test_gcp_storage_mounts_with_stop on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_resources --aws - if: build.env("aws") == "1" - label: test_multiple_resources on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::test_file_mounts --aws - if: build.env("aws") == "1" - label: test_file_mounts on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_vm_pod --gcp - if: build.env("gcp") == "1" - label: test_tpu_vm_pod on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_excluded_file_cloud_storage_upload_copy - on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_managed_job.py::test_managed_jobs_pipeline_failed_setup - --aws - if: build.env("aws") == "1" - label: test_managed_jobs_pipeline_failed_setup on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_http_server_with_custom_ports - --aws - if: build.env("aws") == "1" - label: test_aws_http_server_with_custom_ports on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_gcp_images --gcp - if: build.env("gcp") == "1" - label: test_gcp_images on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_streaming --aws - if: build.env("aws") == "1" - label: test_skyserve_streaming on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_azure_http --azure - if: build.env("azure") == "1" - label: test_skyserve_azure_http on azure -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_upload_source_with_spaces - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_upload_source_with_spaces on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_mount_and_storage.py::TestStorageWithCredentials::test_nonexistent_bucket - --aws - if: build.env("aws") == "1" - label: TestStorageWithCredentials::test_nonexistent_bucket on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_fast_update --aws - if: build.env("aws") == "1" - label: test_skyserve_fast_update on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_task_labels_aws --aws - if: build.env("aws") == "1" - label: test_task_labels_aws on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered - --aws - if: 
build.env("aws") == "1" - label: test_multiple_accelerators_unordered on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_azure_region --azure - if: build.env("azure") == "1" - label: test_azure_region on azure -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_container_logs_two_jobs_kubernetes - --kubernetes - if: build.env("kubernetes") == "1" - label: test_container_logs_two_jobs_kubernetes on kubernetes -- agents: - queue: kubernetes - command: pytest tests/smoke_tests/test_cluster_job.py::test_tpu_pod_slice_gke --kubernetes - if: build.env("kubernetes") == "1" - label: test_tpu_pod_slice_gke on kubernetes -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_core_api_sky_launch_exec --gcp - if: build.env("gcp") == "1" - label: test_core_api_sky_launch_exec on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_region_and_zone.py::test_aws_with_ssh_proxy_command - --aws - if: build.env("aws") == "1" - label: test_aws_with_ssh_proxy_command on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_aws_custom_image --aws - if: build.env("aws") == "1" - label: test_aws_custom_image on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_cluster_job.py::test_long_setup_run_script - --aws - if: build.env("aws") == "1" - label: test_long_setup_run_script on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_images.py::test_clone_disk_gcp --gcp - if: build.env("gcp") == "1" - label: test_clone_disk_gcp on gcp -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_basic.py::test_multiple_accelerators_unordered_with_default - --aws - if: build.env("aws") == "1" - label: test_multiple_accelerators_unordered_with_default on aws -- agents: - queue: generic_cloud - command: pytest 
tests/smoke_tests/test_cluster_job.py::test_aws_disk_tier --aws - if: build.env("aws") == "1" - label: test_aws_disk_tier on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_sky_serve.py::test_skyserve_large_readiness_timeout - --aws - if: build.env("aws") == "1" - label: test_skyserve_large_readiness_timeout on aws -- agents: - queue: generic_cloud - command: pytest tests/smoke_tests/test_managed_job.py::test_managed_jobs_recovery_multi_node_gcp - --gcp - if: build.env("gcp") == "1" - label: test_managed_jobs_recovery_multi_node_gcp on gcp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db40b03b5fa..25fab5b468a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,7 +56,7 @@ repos: hooks: - id: yapf name: yapf - exclude: (build/.*|sky/skylet/providers/ibm/.*) # Matches exclusions from the script + exclude: (sky/skylet/providers/ibm/.*) # Matches exclusions from the script args: ['--recursive', '--parallel'] # Only necessary flags additional_dependencies: [toml==0.10.2] From b14a655b67311cd730b10fbe1e484c4d7f1587a3 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 30 Nov 2024 13:50:36 +0800 Subject: [PATCH 47/64] fix pre-commit --- .buildkite/generate_pipeline.py | 39 ++++++++++++++------------------- .pre-commit-config.yaml | 2 +- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index bd01bcf6bb4..f0b85263551 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,6 +24,7 @@ import copy import os import random + from typing import Any, Dict, List, Optional import yaml @@ -46,11 +47,9 @@ 'kubernetes': QUEUE_KUBERNETES } -GENERATED_FILE_HEAD = ( - '# This is an auto-generated Buildkite pipeline by ' - '.buildkite/generate_pipeline.py, Please do not ' - 'edit directly.\n' -) +GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by ' + 
'.buildkite/generate_pipeline.py, Please do not ' + 'edit directly.\n') def _get_full_decorator_path(decorator: ast.AST) -> str: @@ -107,8 +106,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if cloud not in clouds_to_exclude ] final_clouds_to_include = [ - cloud for cloud in clouds_to_include - if cloud in CLOUD_QUEUE_MAP + cloud for cloud in clouds_to_include if cloud in CLOUD_QUEUE_MAP ] if clouds_to_include and not final_clouds_to_include: print(f'Warning: {file_path}:{node.name} ' @@ -122,7 +120,8 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: return function_cloud_map -def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dict[str, Any]: +def _generate_pipeline(test_file: str, + one_cloud_per_test_function: bool) -> Dict[str, Any]: """Generate a Buildkite pipeline from test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) @@ -144,13 +143,11 @@ def _generate_pipeline(test_file: str, one_cloud_per_test_function: bool) -> Dic return {'steps': steps} -def _dump_pipeline_to_file( - output_file_pipelines_map: Dict[str, List[Dict[str, Any]]], - extra_env: Optional[Dict[str, str]] = None): - default_env = { - 'LOG_TO_STDOUT': '1', - 'PYTHONPATH': '${PYTHONPATH}:$(pwd)' - } +def _dump_pipeline_to_file(output_file_pipelines_map: Dict[str, + List[Dict[str, + Any]]], + extra_env: Optional[Dict[str, str]] = None): + default_env = {'LOG_TO_STDOUT': '1', 'PYTHONPATH': '${PYTHONPATH}:$(pwd)'} if extra_env: default_env.update(extra_env) @@ -163,12 +160,10 @@ def _dump_pipeline_to_file( # Shuffle the steps to avoid flakyness, consecutive runs of the same # kind of test may fail for requiring locks on the same resources. 
random.shuffle(all_steps) - final_pipeline = { - 'steps': all_steps, - 'env': default_env - } + final_pipeline = {'steps': all_steps, 'env': default_env} yaml.dump(final_pipeline, file, default_flow_style=False) + def _convert_release(test_files: List[str]): yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' output_file_pipelines_map = defaultdict(list) @@ -179,9 +174,8 @@ def _convert_release(test_files: List[str]): output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') # Enable all clouds by default for release pipeline. - _dump_pipeline_to_file(output_file_pipelines_map, extra_env={ - cloud: '1' for cloud in CLOUD_QUEUE_MAP - }) + _dump_pipeline_to_file(output_file_pipelines_map, + extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP}) def _convert_pre_merge(test_files: List[str]): @@ -197,6 +191,7 @@ def _convert_pre_merge(test_files: List[str]): print(f'Converted {test_file} to {yaml_file_path}\n\n') _dump_pipeline_to_file(output_file_pipelines_map) + def main(): test_files = os.listdir('tests/smoke_tests') release_files = [] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25fab5b468a..a1e6cc4a7dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,7 +57,7 @@ repos: - id: yapf name: yapf exclude: (sky/skylet/providers/ibm/.*) # Matches exclusions from the script - args: ['--recursive', '--parallel'] # Only necessary flags + args: ['--recursive', '--parallel', '--in-place'] # Only necessary flags additional_dependencies: [toml==0.10.2] - repo: https://github.com/pylint-dev/pylint From f4a1b366d9d062f4fb261e2f66028b17b2a81963 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Sat, 30 Nov 2024 19:25:59 +0800 Subject: [PATCH 48/64] format --- .buildkite/generate_pipeline.py | 1 - .pre-commit-config.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index f0b85263551..0c3703e468d 100644 
--- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,7 +24,6 @@ import copy import os import random - from typing import Any, Dict, List, Optional import yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a1e6cc4a7dd..81f794dac24 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,6 @@ repos: args: - "--sg=build/**" # Matches "${ISORT_YAPF_EXCLUDES[@]}" - "--sg=sky/skylet/providers/ibm/**" - files: "^(sky|tests|examples|llm|docs)/.*" # Only match these directories # Second isort command - id: isort name: isort (IBM specific) From 60c9290790885fdadfc3503831527ea5130de12d Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 12:17:41 +0800 Subject: [PATCH 49/64] support SUPPRESS_SENSITIVE_LOG --- .buildkite/generate_pipeline.py | 38 +++++++++++++++++++++++++++------ tests/smoke_tests/util.py | 11 ++++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 0c3703e468d..8b2cf65e8b9 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -7,6 +7,10 @@ ├── test_*.py -> release pipeline ├── test_required_before_merge.py -> pre-merge pipeline +run `python .buildkite/generate_pipeline.py` to generate the pipeline for +testing. The CI will run this script as a pre-step, and use the generated +pipeline to run the tests. + 1. release pipeline, which runs all smoke tests by default, some function support tests by multiple clouds, but we only generate one cloud per test function to save cost. @@ -36,7 +40,9 @@ 'lambda_cloud' ] QUEUE_GENERIC_CLOUD = 'generic_cloud' +QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve' QUEUE_KUBERNETES = 'kubernetes' +QUEUE_KUBERNETES_SERVE = 'kubernetes_serve' # Only aws, gcp, azure, and kubernetes are supported for now. # Other clouds do not have credentials. 
CLOUD_QUEUE_MAP = { @@ -45,6 +51,15 @@ 'azure': QUEUE_GENERIC_CLOUD, 'kubernetes': QUEUE_KUBERNETES } +# Serve tests runs long, and different test steps usually requires locks. +# Its highly likely to fail if multiple serve tests are running concurrently. +# So we use a different queue that runs only one concurrent test at a time. +SERVE_CLOUD_QUEUE_MAP = { + 'aws': QUEUE_GENERIC_CLOUD_SERVE, + 'gcp': QUEUE_GENERIC_CLOUD_SERVE, + 'azure': QUEUE_GENERIC_CLOUD_SERVE, + 'kubernetes': QUEUE_KUBERNETES_SERVE +} GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by ' '.buildkite/generate_pipeline.py, Please do not ' @@ -82,6 +97,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: clouds_to_include = [] clouds_to_exclude = [] + is_serve_test = False for decorator in node.decorator_list: if isinstance(decorator, ast.Call): # We only need to consider the decorator with no arguments @@ -94,6 +110,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if suffix.startswith('no_'): clouds_to_exclude.append(suffix[3:]) else: + if suffix == 'serve': + is_serve_test = True + continue if suffix not in ALL_CLOUDS_IN_SMOKE_TESTS: # This mark does not specify a cloud, so we skip it. 
continue @@ -104,8 +123,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: cloud for cloud in clouds_to_include if cloud not in clouds_to_exclude ] + cloud_queue_map = SERVE_CLOUD_QUEUE_MAP if is_serve_test else CLOUD_QUEUE_MAP final_clouds_to_include = [ - cloud for cloud in clouds_to_include if cloud in CLOUD_QUEUE_MAP + cloud for cloud in clouds_to_include if cloud in cloud_queue_map ] if clouds_to_include and not final_clouds_to_include: print(f'Warning: {file_path}:{node.name} ' @@ -115,7 +135,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: continue function_name = (f'{class_name}::{node.name}' if class_name else node.name) - function_cloud_map[function_name] = (clouds_to_include) + function_cloud_map[function_name] = (final_clouds_to_include, [ + cloud_queue_map[cloud] for cloud in final_clouds_to_include + ]) return function_cloud_map @@ -124,15 +146,16 @@ def _generate_pipeline(test_file: str, """Generate a Buildkite pipeline from test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) - for test_function, clouds in function_cloud_map.items(): - for cloud in clouds: + for test_function, clouds_and_queues in function_cloud_map.items(): + for cloud, queue in zip(*clouds_and_queues): step = { 'label': f'{test_function} on {cloud}', 'command': f'pytest {test_file}::{test_function} --{cloud}', 'agents': { # Separate agent pool for each cloud. - # Since some are more costly - 'queue': CLOUD_QUEUE_MAP[cloud] + # Since they require different amount of resources and + # concurrency control. 
+ 'queue': queue }, 'if': f'build.env("{cloud}") == "1"' } @@ -188,7 +211,8 @@ def _convert_pre_merge(test_files: List[str]): pipeline = _generate_pipeline(test_file, False) output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') - _dump_pipeline_to_file(output_file_pipelines_map) + _dump_pipeline_to_file(output_file_pipelines_map, + extra_env={'SUPPRESS_SENSITIVE_LOG': '1'}) def main(): diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 413b238703c..2675bb0e35a 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -1,5 +1,6 @@ import enum import inspect +import logging import os import subprocess import sys @@ -12,6 +13,7 @@ import sky from sky import serve +from sky import sky_logging from sky.clouds import AWS from sky.clouds import GCP from sky.jobs.state import ManagedJobStatus @@ -57,6 +59,15 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) +# Suppress the sensitive log in smoke tests. +SUPPRESS_SENSITIVE_LOG = os.environ.get('SUPPRESS_SENSITIVE_LOG', None) +if SUPPRESS_SENSITIVE_LOG: + provisioner_logger = sky_logging.init_logger('sky.provisioner') + optimizer_logger = sky_logging.init_logger('sky.optimizer') + # Do not print the debug logs. 
+ provisioner_logger.setLevel(logging.INFO) + optimizer_logger.setLevel(logging.INFO) + def _statuses_to_str(statuses: List[enum.Enum]): """Convert a list of enums to a string with all the values separated by |.""" From b22afff1aab2f2c186423efc362ac734595603ef Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 13:03:30 +0800 Subject: [PATCH 50/64] support env SKYPILOT_SUPPRESS_SENSITIVE_LOG to suppress debug log --- .buildkite/generate_pipeline.py | 2 +- sky/sky_logging.py | 13 +++++++++++++ sky/utils/env_options.py | 1 + tests/smoke_tests/util.py | 9 --------- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 8b2cf65e8b9..6c5a3b0d21e 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -212,7 +212,7 @@ def _convert_pre_merge(test_files: List[str]): output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') _dump_pipeline_to_file(output_file_pipelines_map, - extra_env={'SUPPRESS_SENSITIVE_LOG': '1'}) + extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'}) def main(): diff --git a/sky/sky_logging.py b/sky/sky_logging.py index 75dc836a49e..f76f5a31b94 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -15,6 +15,7 @@ not env_options.Options.MINIMIZE_LOGGING.get()) _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s' _DATE_FORMAT = '%m-%d %H:%M:%S' +_SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer'] class NewLineFormatter(logging.Formatter): @@ -75,6 +76,18 @@ def _setup_logger(): # Setting this will avoid the message # being propagated to the parent logger. _root_logger.propagate = False + if env_options.Options.SUPPRESS_SENSITIVE_LOG.get(): + # If the sensitive log is enabled, we force set the level to INFO + # to suppress the debug logs for certain loggers. + # Do not propagate to the parent logger to avoid parent + # logger printing the logs. 
+ for logger_name in _SENSITIVE_LOGGER: + logger = logging.getLogger(logger_name) + handler_to_logger = RichSafeStreamHandler(sys.stdout) + handler_to_logger.flush = sys.stdout.flush # type: ignore + logger.addHandler(handler_to_logger) + logger.setLevel(logging.INFO) + logger.propagate = False def reload_logger(): diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index ebec8eeb90d..cfc20a76253 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -11,6 +11,7 @@ class Options(enum.Enum): SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False) DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False) MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True) + SUPPRESS_SENSITIVE_LOG = ('SKYPILOT_SUPPRESS_SENSITIVE_LOG', False) # Internal: this is used to skip the cloud user identity check, which is # used to protect cluster operations in a multi-identity scenario. # Currently, this is only used in the job and serve controller, as there diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/util.py index 2675bb0e35a..0c583d828be 100644 --- a/tests/smoke_tests/util.py +++ b/tests/smoke_tests/util.py @@ -59,15 +59,6 @@ _ALL_MANAGED_JOB_STATUSES = "|".join( [status.value for status in ManagedJobStatus]) -# Suppress the sensitive log in smoke tests. -SUPPRESS_SENSITIVE_LOG = os.environ.get('SUPPRESS_SENSITIVE_LOG', None) -if SUPPRESS_SENSITIVE_LOG: - provisioner_logger = sky_logging.init_logger('sky.provisioner') - optimizer_logger = sky_logging.init_logger('sky.optimizer') - # Do not print the debug logs. 
- provisioner_logger.setLevel(logging.INFO) - optimizer_logger.setLevel(logging.INFO) - def _statuses_to_str(statuses: List[enum.Enum]): """Convert a list of enums to a string with all the values separated by |.""" From def4eb7ba17ef1acec43681fc495736c32801ecb Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 2 Dec 2024 13:21:20 +0800 Subject: [PATCH 51/64] support env SKYPILOT_SUPPRESS_SENSITIVE_LOG to suppress debug log --- sky/sky_logging.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sky/sky_logging.py b/sky/sky_logging.py index f76f5a31b94..944cbcf46d4 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -77,16 +77,21 @@ def _setup_logger(): # being propagated to the parent logger. _root_logger.propagate = False if env_options.Options.SUPPRESS_SENSITIVE_LOG.get(): - # If the sensitive log is enabled, we force set the level to INFO - # to suppress the debug logs for certain loggers. - # Do not propagate to the parent logger to avoid parent - # logger printing the logs. + # If the sensitive log is enabled, we re init a new handler + # and force set the level to INFO to suppress the debug logs + # for certain loggers. for logger_name in _SENSITIVE_LOGGER: logger = logging.getLogger(logger_name) handler_to_logger = RichSafeStreamHandler(sys.stdout) handler_to_logger.flush = sys.stdout.flush # type: ignore logger.addHandler(handler_to_logger) logger.setLevel(logging.INFO) + if _show_logging_prefix: + handler_to_logger.setFormatter(FORMATTER) + else: + handler_to_logger.setFormatter(NO_PREFIX_FORMATTER) + # Do not propagate to the parent logger to avoid parent + # logger printing the logs. 
logger.propagate = False From bef1cf1c580093914720d80a9747fbc8c88027e3 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 4 Dec 2024 16:57:15 +0800 Subject: [PATCH 52/64] add backward_compatibility_tests to pipeline --- .buildkite/generate_pipeline.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 6c5a3b0d21e..f2edae5dfca 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -209,6 +209,14 @@ def _convert_pre_merge(test_files: List[str]): # for pre-merge. And let the author controls which clouds # to run by parameter. pipeline = _generate_pipeline(test_file, False) + pipeline['steps'].append({ + 'label': 'Backward compatibility test', + 'command': 'bash tests/backward_compatibility_tests.sh', + 'agents': { + 'queue': 'back_compat' + }, + 'if': 'build.env("aws") == "1"' + }) output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') _dump_pipeline_to_file(output_file_pipelines_map, From cd64c4c4a43b65fd54a8d5ac1eabf575c2dae02c Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 4 Dec 2024 17:41:21 +0800 Subject: [PATCH 53/64] pip install uv for backward compatibility test --- tests/backward_compatibility_tests.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 696b87ff6ad..511b2c9ba6b 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -35,7 +35,8 @@ rm -r ~/.sky/wheels || true cd ../sky-master git pull origin master pip uninstall -y skypilot -pip install -e ".[all]" +pip install uv +uv pip install -e ".[all]" cd - conda env list | grep sky-back-compat-current || conda create -n sky-back-compat-current -y python=3.9 @@ -43,7 +44,8 @@ conda activate sky-back-compat-current conda install -c conda-forge google-cloud-sdk -y rm -r ~/.sky/wheels || 
true pip uninstall -y skypilot -pip install -e ".[all]" +pip install uv +uv pip install -e ".[all]" # exec + launch From cd4d6e13a787d9e4449516119bcdf18c011d49fa Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 5 Dec 2024 14:20:43 +0800 Subject: [PATCH 54/64] import style --- tests/smoke_tests/__init__.py | 2 +- .../{util.py => smoke_tests_utils.py} | 0 tests/smoke_tests/test_basic.py | 145 +++--- tests/smoke_tests/test_cluster_job.py | 430 +++++++++--------- tests/smoke_tests/test_images.py | 110 +++-- tests/smoke_tests/test_managed_job.py | 319 +++++++------ tests/smoke_tests/test_mount_and_storage.py | 97 ++-- tests/smoke_tests/test_region_and_zone.py | 57 ++- .../smoke_tests/test_required_before_merge.py | 13 +- tests/smoke_tests/test_sky_serve.py | 102 ++--- 10 files changed, 634 insertions(+), 641 deletions(-) rename tests/smoke_tests/{util.py => smoke_tests_utils.py} (100%) diff --git a/tests/smoke_tests/__init__.py b/tests/smoke_tests/__init__.py index 7f91740c201..63d4cd2b811 100644 --- a/tests/smoke_tests/__init__.py +++ b/tests/smoke_tests/__init__.py @@ -1,2 +1,2 @@ """For smoke tests import.""" -__all__ = ['util'] +__all__ = ['smoke_tests_utils'] diff --git a/tests/smoke_tests/util.py b/tests/smoke_tests/smoke_tests_utils.py similarity index 100% rename from tests/smoke_tests/util.py rename to tests/smoke_tests/smoke_tests_utils.py diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index e5ab315434b..e8dffe53846 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -26,16 +26,7 @@ import time import pytest -from smoke_tests.util import BUMP_UP_SECONDS -from smoke_tests.util import get_cluster_name -from smoke_tests.util import get_cmd_wait_until_cluster_status_contains -from smoke_tests.util import ( - get_cmd_wait_until_job_status_contains_without_matching_job) -from smoke_tests.util import get_timeout -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_TYPE -from 
smoke_tests.util import Test -from smoke_tests.util import VALIDATE_LAUNCH_OUTPUT +from smoke_tests import smoke_tests_utils import sky from sky.skylet import events @@ -45,25 +36,25 @@ # ---------- Dry run: 2 Tasks in a chain. ---------- @pytest.mark.no_fluidstack #requires GCP and AWS set up def test_example_app(): - test = Test( + test = smoke_tests_utils.Test( 'example_app', ['python examples/example_app.py'], ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- A minimal task ---------- def test_minimal(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'minimal', [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', # Output validation done. f'sky logs {name} 1 --status', f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. 
# Check the logs downloading @@ -89,20 +80,20 @@ def test_minimal(generic_cloud: str): f'sky exec -c {name} && exit 1 || true', ], f'sky down -y {name}', - get_timeout(generic_cloud), + smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Test fast launch ---------- def test_launch_fast(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() - test = Test( + test = smoke_tests_utils.Test( 'test_launch_fast', [ # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', # Second launch to test fast launch - should not reprovision @@ -118,9 +109,9 @@ def test_launch_fast(generic_cloud: str): f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=get_timeout(generic_cloud), + timeout=smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # See cloud exclusion explanations in test_autostop @@ -129,35 +120,35 @@ def test_launch_fast(generic_cloud: str): @pytest.mark.no_ibm @pytest.mark.no_kubernetes def test_launch_fast_with_autostop(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. 
autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = Test( + test = smoke_tests_utils.Test( 'test_launch_fast_with_autostop', [ # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', # Ensure cluster is stopped - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=autostop_timeout), # Even the cluster is stopped, cloud platform may take a while to # delete the VM. - f'sleep {BUMP_UP_SECONDS}', + f'sleep {smoke_tests_utils.BUMP_UP_SECONDS}', # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {VALIDATE_LAUNCH_OUTPUT}', + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', f'sky logs {name} 2 --status', f'sky status -r {name} | grep UP', ], f'sky down -y {name}', - timeout=get_timeout(generic_cloud) + autostop_timeout, + timeout=smoke_tests_utils.get_timeout(generic_cloud) + autostop_timeout, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ------------ Test stale job ------------ @@ -165,14 +156,14 @@ def test_launch_fast_with_autostop(generic_cloud: str): @pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_stale_job(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = 
smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'stale_job', [ f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=100), @@ -182,16 +173,16 @@ def test_stale_job(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_stale_job_manual_restart(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.AWS.max_cluster_name_length()) region = 'us-east-2' - test = Test( + test = smoke_tests_utils.Test( 'aws_stale_job_manual_restart', [ f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', @@ -203,7 +194,7 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=40), @@ -211,6 +202,7 @@ def test_aws_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. + smoke_tests_utils. 
get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, job_status=[sky.JobStatus.FAILED_DRIVER], @@ -218,12 +210,12 @@ def test_aws_stale_job_manual_restart(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_stale_job_manual_restart(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.GCP.max_cluster_name_length()) zone = 'us-west2-a' @@ -232,7 +224,7 @@ def test_gcp_stale_job_manual_restart(): f'--zones={zone} --format="value(name)"') stop_cmd = (f'gcloud compute instances stop --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'gcp_stale_job_manual_restart', [ f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', @@ -244,6 +236,7 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. + smoke_tests_utils. get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, job_status=[sky.JobStatus.FAILED_DRIVER], @@ -251,16 +244,16 @@ def test_gcp_stale_job_manual_restart(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Check Sky's environment variables; workdir. 
---------- @pytest.mark.no_fluidstack # Requires amazon S3 @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_env_check(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( + test = smoke_tests_utils.Test( 'env_check', [ f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', @@ -269,19 +262,19 @@ def test_env_check(generic_cloud: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- CLI logs ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. def test_cli_logs(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() num_nodes = 2 if generic_cloud == 'kubernetes': # Kubernetes does not support multi-node num_nodes = 1 timestamp = time.time() - test = Test('cli_logs', [ + test = smoke_tests_utils.Test('cli_logs', [ f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', f'sky exec {name} "echo {timestamp} 2"', f'sky exec {name} "echo {timestamp} 3"', @@ -292,17 +285,17 @@ def test_cli_logs(generic_cloud: str): f'sky logs {name} 1 | grep "{timestamp} 1"', f'sky logs {name} | grep "{timestamp} 4"', ], f'sky down -y {name}') - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_logs(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() timestamp = time.time() - test = Test( + test = smoke_tests_utils.Test( 'SCP_cli_logs', [ - f'sky launch -y -c {name} {SCP_TYPE} "echo {timestamp} 1"', + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} "echo {timestamp} 1"', f'sky exec {name} "echo {timestamp} 2"', f'sky exec {name} "echo {timestamp} 3"', f'sky exec {name} "echo {timestamp} 4"', @@ -314,7 +307,7 @@ def test_scp_logs(): ], 
f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ------- Testing the core API -------- @@ -324,7 +317,7 @@ def test_scp_logs(): @pytest.mark.gcp def test_core_api_sky_launch_exec(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task = sky.Task(run="whoami") task.set_resources(sky.Resources(cloud=sky.GCP())) job_id, handle = sky.launch(task, cluster_name=name) @@ -347,7 +340,7 @@ def test_core_api_sky_launch_exec(): # The sky launch CLI has some additional checks to make sure the cluster is up/ # restarted. However, the core API doesn't have these; make sure it still works def test_core_api_sky_launch_fast(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) try: task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) @@ -356,7 +349,7 @@ def test_core_api_sky_launch_fast(generic_cloud: str): idle_minutes_to_autostop=1, fast=True) # Sleep to let the cluster autostop - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=120) @@ -430,8 +423,8 @@ def test_load_dump_yaml_config_equivalent(self): @pytest.mark.no_fluidstack # Fluidstack does not support K80 gpus for now @pytest.mark.no_paperspace # Paperspace does not support K80 gpus def test_multiple_accelerators_ordered(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-ordered', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', @@ -440,14 +433,14 @@ def test_multiple_accelerators_ordered(): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack has low 
availability for T4 GPUs @pytest.mark.no_paperspace # Paperspace does not support T4 GPUs def test_multiple_accelerators_ordered_with_default(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-ordered', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', @@ -456,14 +449,14 @@ def test_multiple_accelerators_ordered_with_default(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs @pytest.mark.no_paperspace # Paperspace does not support T4 GPUs def test_multiple_accelerators_unordered(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-unordered', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', @@ -471,14 +464,14 @@ def test_multiple_accelerators_unordered(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs @pytest.mark.no_paperspace # Paperspace does not support T4 GPUs def test_multiple_accelerators_unordered_with_default(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-accelerators-unordered-with-default', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', @@ -487,13 +480,13 @@ def test_multiple_accelerators_unordered_with_default(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Requires other clouds to be enabled def test_multiple_resources(): - name = get_cluster_name() - test = Test( + name = 
smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multiple-resources', [ f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', @@ -501,7 +494,7 @@ def test_multiple_resources(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Sky Benchmark ---------- @@ -510,8 +503,8 @@ def test_multiple_resources(): @pytest.mark.no_kubernetes @pytest.mark.aws # SkyBenchmark requires S3 access def test_sky_bench(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'sky-bench', [ f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', @@ -520,7 +513,7 @@ def test_sky_bench(generic_cloud: str): ], f'sky bench down {name} -y; sky bench delete {name} -y', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -558,8 +551,8 @@ def test_kubernetes_context_failover(): with tempfile.NamedTemporaryFile(delete=True) as f: f.write(config.encode('utf-8')) f.flush() - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'kubernetes-context-failover', [ # Check if kind-skypilot is provisioned with H100 annotations already @@ -606,4 +599,4 @@ def test_kubernetes_context_failover(): f'sky down -y {name}-1 {name}-3', env={'SKYPILOT_CONFIG': f.name}, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 8b97ab4eef9..0255884ae30 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -25,19 +25,7 @@ import jinja2 import pytest -from smoke_tests.util import BUMP_UP_SECONDS -from smoke_tests.util import get_aws_region_for_quota_failover -from smoke_tests.util import get_cluster_name -from smoke_tests.util import 
get_cmd_wait_until_cluster_status_contains -from smoke_tests.util import ( - get_cmd_wait_until_job_status_contains_matching_job_id) -from smoke_tests.util import get_gcp_region_for_quota_failover -from smoke_tests.util import get_timeout -from smoke_tests.util import LAMBDA_TYPE -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_GPU_V100 -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky from sky import AWS @@ -56,8 +44,8 @@ @pytest.mark.no_paperspace # Paperspace does not have T4 gpus. @pytest.mark.no_oci # OCI does not have T4 gpus def test_job_queue(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'job_queue', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml', @@ -78,7 +66,7 @@ def test_job_queue(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Job Queue with Docker. 
---------- @@ -106,10 +94,10 @@ def test_job_queue(generic_cloud: str): 'docker:winglian/axolotl:main-latest' ]) def test_job_queue_with_docker(generic_cloud: str, image_id: str): - name = get_cluster_name() + image_id[len('docker:'):][:4] + name = smoke_tests_utils.get_cluster_name() + image_id[len('docker:'):][:4] total_timeout_minutes = 40 if generic_cloud == 'azure' else 15 time_to_sleep = 300 if generic_cloud == 'azure' else 180 - test = Test( + test = smoke_tests_utils.Test( 'job_queue_with_docker', [ f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', @@ -145,16 +133,16 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.lambda_cloud def test_lambda_job_queue(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'lambda_job_queue', [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/job_queue/cluster.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/job_queue/cluster.yaml', f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml', f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml', f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml', @@ -168,13 +156,13 @@ def test_lambda_job_queue(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_job_queue(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'ibm_job_queue', [ f'sky launch -y -c {name} --cloud ibm --gpus v100', @@ -191,21 +179,21 @@ def test_ibm_job_queue(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_job_queue(): - name 
= get_cluster_name() + name = smoke_tests_utils.get_cluster_name() num_of_gpu_launch = 1 num_of_gpu_exec = 0.5 - test = Test( + test = smoke_tests_utils.Test( 'SCP_job_queue', [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 {SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', + f'sky exec {name} -n {name}-1 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', + f'sky exec {name} -n {name}-2 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', + f'sky exec {name} -n {name}-3 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', f'sky queue {name} | grep {name}-1 | grep RUNNING', f'sky queue {name} | grep {name}-2 | grep RUNNING', f'sky queue {name} | grep {name}-3 | grep PENDING', @@ -216,7 +204,7 @@ def test_scp_job_queue(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs @@ -227,9 +215,9 @@ def test_scp_job_queue(): @pytest.mark.no_oci # OCI Cloud does not have T4 gpus. 
@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet def test_job_queue_multinode(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() total_timeout_minutes = 30 if generic_cloud == 'azure' else 15 - test = Test( + test = smoke_tests_utils.Test( 'job_queue_multinode', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml', @@ -261,14 +249,14 @@ def test_job_queue_multinode(generic_cloud: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_large_job_queue(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'large_job_queue', [ f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', @@ -306,15 +294,15 @@ def test_large_job_queue(generic_cloud: str): f'sky down -y {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_fast_large_job_queue(generic_cloud: str): # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. 
- name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'fast_large_job_queue', [ f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', @@ -325,14 +313,14 @@ def test_fast_large_job_queue(generic_cloud: str): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_job_queue_multinode(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_file = 'examples/job_queue/job_multinode_ibm.yaml' - test = Test( + test = smoke_tests_utils.Test( 'ibm_job_queue_multinode', [ f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2', @@ -364,7 +352,7 @@ def test_ibm_job_queue_multinode(): f'sky down -y {name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Docker with preinstalled package. ---------- @@ -376,8 +364,8 @@ def test_ibm_job_queue_multinode(): @pytest.mark.no_kubernetes # Doesn't support Kubernetes for now # TODO(zhwu): we should fix this for kubernetes def test_docker_preinstalled_package(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'docker_with_preinstalled_package', [ f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx', @@ -387,7 +375,7 @@ def test_docker_preinstalled_package(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Submitting multiple tasks to the same cluster. 
---------- @@ -398,8 +386,8 @@ def test_docker_preinstalled_package(generic_cloud: str): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet @pytest.mark.no_oci # OCI Cloud does not have T4 gpus def test_multi_echo(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multi_echo', [ f'python examples/multi_echo.py {name} {generic_cloud}', @@ -418,6 +406,7 @@ def test_multi_echo(generic_cloud: str): ] + # Ensure jobs succeeded. [ + smoke_tests_utils. get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=i + 1, @@ -430,7 +419,7 @@ def test_multi_echo(generic_cloud: str): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Task: 1 node training. ---------- @@ -439,8 +428,8 @@ def test_multi_echo(generic_cloud: str): @pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA @pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. 
def test_huggingface(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'huggingface_glue_imdb_app', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml', @@ -450,47 +439,47 @@ def test_huggingface(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.lambda_cloud def test_lambda_huggingface(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'lambda_huggingface_glue_imdb_app', [ - f'sky launch -y -c {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', + f'sky exec {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_huggingface(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() num_of_gpu_launch = 1 - test = Test( + test = smoke_tests_utils.Test( 'SCP_huggingface_glue_imdb_app', [ - f'sky launch -y -c {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky exec {name} {SCP_TYPE} {SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', + f'sky exec {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Inferentia. ---------- @pytest.mark.aws def test_inferentia(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test_inferentia', [ f'sky launch -y -c {name} -t inf2.xlarge -- echo hi', @@ -500,15 +489,15 @@ def test_inferentia(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU. ---------- @pytest.mark.gcp @pytest.mark.tpu def test_tpu(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_app', [ f'sky launch -y -c {name} examples/tpu/tpu_app.yaml', @@ -519,15 +508,15 @@ def test_tpu(): f'sky down -y {name}', timeout=30 * 60, # can take >20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU VM. ---------- @pytest.mark.gcp @pytest.mark.tpu def test_tpu_vm(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_vm_app', [ f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml', @@ -545,15 +534,15 @@ def test_tpu_vm(): f'sky down -y {name}', timeout=30 * 60, # can take 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU VM Pod. 
---------- @pytest.mark.gcp @pytest.mark.tpu def test_tpu_vm_pod(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_pod', [ f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a', @@ -563,14 +552,14 @@ def test_tpu_vm_pod(): f'sky down -y {name}', timeout=30 * 60, # can take 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- TPU Pod Slice on GKE. ---------- @pytest.mark.kubernetes def test_tpu_pod_slice_gke(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'tpu_pod_slice_gke', [ f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', @@ -582,15 +571,15 @@ def test_tpu_pod_slice_gke(): f'sky down -y {name}', timeout=30 * 60, # can take 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Simple apps. ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_hostname(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = Test( + test = smoke_tests_utils.Test( 'multi_hostname', [ f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml', @@ -600,15 +589,16 @@ def test_multi_hostname(generic_cloud: str): f'sky logs {name} 2 --status', # Ensure the job succeeded. 
], f'sky down -y {name}', - timeout=get_timeout(generic_cloud, total_timeout_minutes * 60), + timeout=smoke_tests_utils.get_timeout(generic_cloud, + total_timeout_minutes * 60), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_node_failure(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'multi_node_failure', [ # TODO(zhwu): we use multi-thread to run the commands in setup @@ -626,14 +616,14 @@ def test_multi_node_failure(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on GCP. ---------- @pytest.mark.gcp def test_gcp_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml', @@ -643,14 +633,14 @@ def test_gcp_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on AWS. ---------- @pytest.mark.aws def test_aws_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml', @@ -660,14 +650,14 @@ def test_aws_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on Azure. 
---------- @pytest.mark.azure def test_azure_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml', @@ -677,14 +667,14 @@ def test_azure_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on Kubernetes. ---------- @pytest.mark.kubernetes def test_kubernetes_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'kubernetes_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml', @@ -694,14 +684,14 @@ def test_kubernetes_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on Paperspace. ---------- @pytest.mark.paperspace def test_paperspace_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'paperspace_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml', @@ -711,14 +701,14 @@ def test_paperspace_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Web apps with custom ports on RunPod. 
---------- @pytest.mark.runpod def test_runpod_http_server_with_custom_ports(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'runpod_http_server_with_custom_ports', [ f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml', @@ -728,13 +718,13 @@ def test_runpod_http_server_with_custom_ports(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Labels from task on AWS (instance_tags) ---------- @pytest.mark.aws def test_task_labels_aws(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() template_str = pathlib.Path( 'tests/test_yamls/test_labels.yaml.j2').read_text() template = jinja2.Template(template_str) @@ -743,7 +733,7 @@ def test_task_labels_aws(): f.write(content) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'task_labels_aws', [ f'sky launch -y -c {name} {file_path}', @@ -758,13 +748,13 @@ def test_task_labels_aws(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Labels from task on GCP (labels) ---------- @pytest.mark.gcp def test_task_labels_gcp(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() template_str = pathlib.Path( 'tests/test_yamls/test_labels.yaml.j2').read_text() template = jinja2.Template(template_str) @@ -773,7 +763,7 @@ def test_task_labels_gcp(): f.write(content) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'task_labels_gcp', [ f'sky launch -y -c {name} {file_path}', @@ -785,13 +775,13 @@ def test_task_labels_gcp(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Labels from task on Kubernetes (labels) ---------- @pytest.mark.kubernetes def test_task_labels_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() template_str = 
pathlib.Path( 'tests/test_yamls/test_labels.yaml.j2').read_text() template = jinja2.Template(template_str) @@ -800,7 +790,7 @@ def test_task_labels_kubernetes(): f.write(content) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'task_labels_kubernetes', [ f'sky launch -y -c {name} {file_path}', @@ -813,14 +803,14 @@ def test_task_labels_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Pod Annotations on Kubernetes ---------- @pytest.mark.kubernetes def test_add_pod_annotations_for_autodown_with_launch(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'add_pod_annotations_for_autodown_with_launch', [ # Launch Kubernetes cluster with two nodes, each being head node and worker node. @@ -838,13 +828,13 @@ def test_add_pod_annotations_for_autodown_with_launch(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_add_and_remove_pod_annotations_with_autostop(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'add_and_remove_pod_annotations_with_autostop', [ # Launch Kubernetes cluster with two nodes, each being head node and worker node. 
@@ -871,13 +861,13 @@ def test_add_and_remove_pod_annotations_with_autostop(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Container logs from task on Kubernetes ---------- @pytest.mark.kubernetes def test_container_logs_multinode_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' head_logs = ('kubectl get pods ' f' | grep {name} | grep head | ' @@ -886,7 +876,7 @@ def test_container_logs_multinode_kubernetes(): f' | grep {name} | grep worker |' " awk '{print $1}' | xargs -I {} kubectl logs {}") with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( + test = smoke_tests_utils.Test( 'container_logs_multinode_kubernetes', [ f'sky launch -y -c {name} {task_yaml} --num-nodes 2', @@ -895,18 +885,18 @@ def test_container_logs_multinode_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_container_logs_two_jobs_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' pod_logs = ('kubectl get pods ' f' | grep {name} | grep head |' " awk '{print $1}' | xargs -I {} kubectl logs {}") with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( + test = smoke_tests_utils.Test( 'test_container_logs_two_jobs_kubernetes', [ f'sky launch -y -c {name} {task_yaml}', @@ -925,18 +915,18 @@ def test_container_logs_two_jobs_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_container_logs_two_simultaneous_jobs_kubernetes(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' pod_logs = ('kubectl get pods ' f' | grep {name} | grep head |' " awk '{print $1}' | xargs -I {} kubectl logs {}") 
with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = Test( + test = smoke_tests_utils.Test( 'test_container_logs_two_simultaneous_jobs_kubernetes', [ f'sky launch -y -c {name}', @@ -956,7 +946,7 @@ def test_container_logs_two_simultaneous_jobs_kubernetes(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Task: n=2 nodes with setups. ---------- @@ -967,8 +957,8 @@ def test_container_logs_two_simultaneous_jobs_kubernetes(): reason= 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') def test_distributed_tf(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'resnet_distributed_tf_app', [ # NOTE: running it twice will hang (sometimes?) - an app-level bug. @@ -978,14 +968,14 @@ def test_distributed_tf(generic_cloud: str): f'sky down -y {name}', timeout=25 * 60, # 25 mins (it takes around ~19 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing GCP start and stop instances ---------- @pytest.mark.gcp def test_gcp_start_stop(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp-start-stop', [ f'sky launch -y -c {name} examples/gcp_start_stop.yaml', @@ -995,14 +985,14 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. 
f'sky stop -y {name}', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[ sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT @@ -1011,14 +1001,14 @@ def test_gcp_start_stop(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Azure start and stop instances ---------- @pytest.mark.azure def test_azure_start_stop(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure-start-stop', [ f'sky launch -y -c {name} examples/azure_start_stop.yaml', @@ -1030,7 +1020,7 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[ sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT @@ -1041,7 +1031,7 @@ def test_azure_start_stop(): f'sky down -y {name}', timeout=30 * 60, # 30 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Autostopping ---------- @@ -1051,14 +1041,14 @@ def test_azure_start_stop(): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet @pytest.mark.no_kubernetes # Kubernetes does not autostop yet def test_autostop(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure # the VM is stopped. 
autostop_timeout = 600 if generic_cloud == 'azure' else 250 # Launching and starting Azure clusters can take a long time too. e.g., restart # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 - test = Test( + test = smoke_tests_utils.Test( 'autostop', [ f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', @@ -1072,7 +1062,7 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=autostop_timeout), @@ -1091,7 +1081,7 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=autostop_timeout), @@ -1103,27 +1093,27 @@ def test_autostop(generic_cloud: str): 'sleep 45', # Almost reached the threshold. f'sky exec {name} echo hi', # Should restart the timer. 
'sleep 45', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout + BUMP_UP_SECONDS), + timeout=autostop_timeout + smoke_tests_utils.BUMP_UP_SECONDS), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Autodowning ---------- @pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. def test_autodown(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure # the VM is terminated. autodown_timeout = 900 if generic_cloud == 'azure' else 240 total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 - test = Test( + test = smoke_tests_utils.Test( 'autodown', [ f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', @@ -1152,16 +1142,16 @@ def test_autodown(generic_cloud: str): f'sky down -y {name}', timeout=total_timeout_minutes * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_autodown(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'SCP_autodown', [ - f'sky launch -y -d -c {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', f'sky autostop -y {name} --down -i 1', # Ensure autostop is set. f'sky status | grep {name} | grep "1m (down)"', @@ -1171,14 +1161,14 @@ def test_scp_autodown(): # Ensure the cluster is terminated. 
'sleep 200', f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} {SCP_TYPE} tests/test_yamls/minimal.yaml', + f'sky exec {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', f'sky status | grep {name} | grep "1m (down)"', 'sleep 200', # Ensure the cluster is terminated. f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {SCP_TYPE} --down tests/test_yamls/minimal.yaml', + f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', f'sky autostop -y {name} --cancel', 'sleep 200', # Ensure the cluster is still UP. 
@@ -1187,11 +1177,11 @@ def test_scp_autodown(): f'sky down -y {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): - test = Test( + test = smoke_tests_utils.Test( f'{cloud}-cancel-task', [ f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', @@ -1214,23 +1204,23 @@ def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): # ---------- Testing `sky cancel` ---------- @pytest.mark.aws def test_cancel_aws(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test = _get_cancel_task_with_cloud(name, 'aws') - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_cancel_gcp(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test = _get_cancel_task_with_cloud(name, 'gcp') - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_cancel_azure(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now @@ -1239,8 +1229,8 @@ def test_cancel_azure(): @pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_cancel_pytorch(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'cancel-pytorch', [ f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d', @@ -1262,15 +1252,15 @@ def test_cancel_pytorch(generic_cloud: str): f'sky down -y {name}', timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` # requires a CUDA public image, which IBM 
doesn't offer @pytest.mark.ibm def test_cancel_ibm(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'ibm-cancel-task', [ f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', @@ -1283,7 +1273,7 @@ def test_cancel_ibm(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing use-spot option ---------- @@ -1295,8 +1285,8 @@ def test_cancel_ibm(): @pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances def test_use_spot(generic_cloud: str): """Test use-spot and sky exec.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'use-spot', [ f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', @@ -1306,14 +1296,14 @@ def test_use_spot(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_stop_gcp_spot(): """Test GCP spot can be stopped, autostopped, restarted.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'stop_gcp_spot', [ f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', @@ -1323,7 +1313,7 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=90), @@ -1332,21 +1322,21 @@ def test_stop_gcp_spot(): f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], 
timeout=120), ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env ---------- def test_inline_env(generic_cloud: str): """Test env""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-inline-env', [ f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', @@ -1356,16 +1346,16 @@ def test_inline_env(generic_cloud: str): f'sky logs {name} 2 --status', ], f'sky down -y {name}', - get_timeout(generic_cloud), + smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env file ---------- def test_inline_env_file(generic_cloud: str): """Test env""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-inline-env-file', [ f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', @@ -1374,17 +1364,17 @@ def test_inline_env_file(generic_cloud: str): f'sky logs {name} 2 --status', ], f'sky down -y {name}', - get_timeout(generic_cloud), + smoke_tests_utils.get_timeout(generic_cloud), ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing custom image ---------- @pytest.mark.aws def test_aws_custom_image(): """Test AWS custom image""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-aws-custom-image', [ f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image @@ -1393,7 +1383,7 @@ def test_aws_custom_image(): f'sky down -y {name}', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -1410,8 +1400,8 @@ def test_aws_custom_image(): ]) def test_kubernetes_custom_image(image_id): """Test Kubernetes custom image""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-kubernetes-custom-image', [ f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', @@ -1424,13 +1414,13 @@ def test_kubernetes_custom_image(image_id): f'sky down -y {name}', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_start_stop_two_nodes(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure-start-stop-two-nodes', [ f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', @@ -1440,18 +1430,18 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs 
{name} 2 --status', # Ensure the job succeeded. - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[ sky.ClusterStatus.INIT, sky.ClusterStatus.STOPPED ], - timeout=200 + BUMP_UP_SECONDS) + + timeout=200 + smoke_tests_utils.BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', timeout=30 * 60, # 30 mins (it takes around ~23 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env for disk tier ---------- @@ -1465,11 +1455,11 @@ def _get_aws_query_command(region, instance_id, field, expected): for disk_tier in list(resources_utils.DiskTier): specs = AWS._get_disk_specs(disk_tier) - name = get_cluster_name() + '-' + disk_tier.value + name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.AWS.max_cluster_name_length()) region = 'us-east-2' - test = Test( + test = smoke_tests_utils.Test( 'aws-disk-tier-' + disk_tier.value, [ f'sky launch -y -c {name} --cloud aws --region {region} ' @@ -1488,14 +1478,14 @@ def _get_aws_query_command(region, instance_id, field, expected): f'sky down -y {name}', timeout=10 * 60, # 10 mins (it takes around ~6 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_disk_tier(): for disk_tier in list(resources_utils.DiskTier): disk_types = [GCP._get_disk_type(disk_tier)] - name = get_cluster_name() + '-' + disk_tier.value + name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.GCP.max_cluster_name_length()) region = 'us-west2' @@ -1510,7 +1500,7 @@ def test_gcp_disk_tier(): instance_type_options = ['', '--instance-type n2-standard-64'] for disk_type, instance_type_option in zip(disk_types, instance_type_options): - test = Test( + test = smoke_tests_utils.Test( 
'gcp-disk-tier-' + disk_tier.value, [ f'sky launch -y -c {name} --cloud gcp --region {region} ' @@ -1524,7 +1514,7 @@ def test_gcp_disk_tier(): f'sky down -y {name}', timeout=6 * 60, # 6 mins (it takes around ~3 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure @@ -1534,11 +1524,11 @@ def test_azure_disk_tier(): # Azure does not support high and ultra disk tier. continue type = Azure._get_disk_type(disk_tier) - name = get_cluster_name() + '-' + disk_tier.value + name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.Azure.max_cluster_name_length()) region = 'westus2' - test = Test( + test = smoke_tests_utils.Test( 'azure-disk-tier-' + disk_tier.value, [ f'sky launch -y -c {name} --cloud azure --region {region} ' @@ -1550,17 +1540,17 @@ def test_azure_disk_tier(): f'sky down -y {name}', timeout=20 * 60, # 20 mins (it takes around ~12 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_best_tier_failover(): type = Azure._get_disk_type(resources_utils.DiskTier.LOW) - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, sky.Azure.max_cluster_name_length()) region = 'westus2' - test = Test( + test = smoke_tests_utils.Test( 'azure-best-tier-failover', [ f'sky launch -y -c {name} --cloud azure --region {region} ' @@ -1572,15 +1562,15 @@ def test_azure_best_tier_failover(): f'sky down -y {name}', timeout=20 * 60, # 20 mins (it takes around ~12 mins) ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ------ Testing Zero Quota Failover ------ @pytest.mark.aws def test_aws_zero_quota_failover(): - name = get_cluster_name() - region = get_aws_region_for_quota_failover() + name = smoke_tests_utils.get_cluster_name() + region = smoke_tests_utils.get_aws_region_for_quota_failover() if not region: pytest.xfail( @@ -1589,21 
+1579,21 @@ def test_aws_zero_quota_failover(): 'expected for your account?') return - test = Test( + test = smoke_tests_utils.Test( 'aws-zero-quota-failover', [ f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_zero_quota_failover(): - name = get_cluster_name() - region = get_gcp_region_for_quota_failover() + name = smoke_tests_utils.get_cluster_name() + region = smoke_tests_utils.get_gcp_region_for_quota_failover() if not region: pytest.xfail( @@ -1612,18 +1602,18 @@ def test_gcp_zero_quota_failover(): 'expected for your account?') return - test = Test( + test = smoke_tests_utils.Test( 'gcp-zero-quota-failover', [ f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) def test_long_setup_run_script(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() with tempfile.NamedTemporaryFile('w', prefix='sky_app_', suffix='.yaml') as f: f.write( @@ -1644,7 +1634,7 @@ def test_long_setup_run_script(generic_cloud: str): f.write(' echo "end run"\n') f.flush() - test = Test( + test = smoke_tests_utils.Test( 'long-setup-run-script', [ f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', @@ -1656,4 +1646,4 @@ def test_long_setup_run_script(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py index 4fa39d98177..27d6a693ae6 100644 --- a/tests/smoke_tests/test_images.py +++ b/tests/smoke_tests/test_images.py @@ -20,11 +20,7 @@ # > pytest tests/smoke_tests/test_images.py --generic-cloud aws import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util 
import get_cmd_wait_until_cluster_is_not_found -from smoke_tests.util import get_cmd_wait_until_cluster_status_contains -from smoke_tests.util import run_one_test -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky @@ -32,8 +28,8 @@ # ---------- Test the image ---------- @pytest.mark.aws def test_aws_images(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_images', [ f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', @@ -47,13 +43,13 @@ def test_aws_images(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_images(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_images', [ f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', @@ -67,13 +63,13 @@ def test_gcp_images(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_images(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure_images', [ f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', @@ -87,13 +83,13 @@ def test_azure_images(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_image_id_dict(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_image_id_dict', [ # Use image id dict. 
@@ -106,13 +102,13 @@ def test_aws_image_id_dict(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_image_id_dict(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_image_id_dict', [ # Use image id dict. @@ -125,13 +121,13 @@ def test_gcp_image_id_dict(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_image_id_dict_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_image_id_dict_region', [ # YAML has @@ -162,13 +158,13 @@ def test_aws_image_id_dict_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_image_id_dict_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_image_id_dict_region', [ # Use region to filter image_id dict. @@ -195,13 +191,13 @@ def test_gcp_image_id_dict_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_image_id_dict_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_image_id_dict_zone', [ # YAML has @@ -233,13 +229,13 @@ def test_aws_image_id_dict_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_image_id_dict_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_image_id_dict_zone', [ # Use zone to filter image_id dict. 
@@ -267,19 +263,19 @@ def test_gcp_image_id_dict_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_clone_disk_aws(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'clone_disk_aws', [ f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=60), @@ -294,13 +290,13 @@ def test_clone_disk_aws(): f'sky down -y {name} {name}-clone {name}-clone-2', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_clone_disk_gcp(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'clone_disk_gcp', [ f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', @@ -313,14 +309,14 @@ def test_clone_disk_gcp(): ], f'sky down -y {name} {name}-clone {name}-clone-2', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_mig(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() region = 'us-central1' - test = Test( + test = smoke_tests_utils.Test( 'gcp_mig', [ f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', @@ -331,8 +327,8 @@ def test_gcp_mig(): # Check MIG exists. 
f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - get_cmd_wait_until_cluster_is_not_found(cluster_name=name, - timeout=120), + smoke_tests_utils.get_cmd_wait_until_cluster_is_not_found( + cluster_name=name, timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. @@ -343,12 +339,12 @@ def test_gcp_mig(): ], f'sky down -y {name}', env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_force_enable_external_ips(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test_commands = [ f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', # Check network of vm is "default" @@ -361,17 +357,17 @@ def test_gcp_force_enable_external_ips(): f'sky down -y {name}', ] skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' - test = Test('gcp_force_enable_external_ips', - test_commands, - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': skypilot_config}) - run_one_test(test) + test = smoke_tests_utils.Test('gcp_force_enable_external_ips', + test_commands, + f'sky down -y {name}', + env={'SKYPILOT_CONFIG': skypilot_config}) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_image_no_conda(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'image_no_conda', [ # Use image id dict. 
@@ -384,14 +380,14 @@ def test_image_no_conda(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_custom_default_conda_env(generic_cloud: str): - name = get_cluster_name() - test = Test('custom_default_conda_env', [ + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test('custom_default_conda_env', [ f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky status -r {name} | grep "UP"', f'sky logs {name} 1 --status', @@ -399,7 +395,7 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - get_cmd_wait_until_cluster_status_contains( + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( cluster_name=name, cluster_status=[sky.ClusterStatus.STOPPED], timeout=80), @@ -408,4 +404,4 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 3 --status', ], f'sky down -y {name}') - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 4d6f1dd9614..c8ef5c1a502 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -27,20 +27,12 @@ import time import pytest +from smoke_tests import smoke_tests_utils from smoke_tests.test_mount_and_storage import TestStorageWithCredentials -from smoke_tests.util import BUMP_UP_SECONDS -from smoke_tests.util import get_cluster_name -from smoke_tests.util import ( - get_cmd_wait_until_managed_job_status_contains_matching_job_name) -from smoke_tests.util import GET_JOB_QUEUE 
-from smoke_tests.util import JOB_WAIT_NOT_RUNNING -from smoke_tests.util import run_one_test -from smoke_tests.util import STORAGE_SETUP_COMMANDS -from smoke_tests.util import Test +import sky from sky import jobs from sky.data import storage as storage_lib -from sky.jobs.state import ManagedJobStatus from sky.skylet import constants from sky.utils import common_utils @@ -52,35 +44,40 @@ @pytest.mark.managed_jobs def test_managed_jobs(generic_cloud: str): """Test the managed jobs yaml.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed-jobs', [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, - ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + sky.ManagedJobStatus.PENDING, + sky.ManagedJobStatus.SUBMITTED, + sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING ], timeout=60), + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', job_status=[ - ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, - ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + sky.ManagedJobStatus.PENDING, + sky.ManagedJobStatus.SUBMITTED, + sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING ], timeout=60), f'sky jobs cancel -y -n {name}-1', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status=[ManagedJobStatus.CANCELLED], + job_status=[sky.ManagedJobStatus.CANCELLED], timeout=230), # Test the functionality for logging. 
f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', - f'{GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', ], # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when # canceling multiple job names is supported. @@ -88,7 +85,7 @@ def test_managed_jobs(generic_cloud: str): # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack #fluidstack does not support spot instances @@ -100,36 +97,36 @@ def test_managed_jobs(generic_cloud: str): @pytest.mark.managed_jobs def test_job_pipeline(generic_cloud: str): """Test a job pipeline.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'spot-pipeline', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', 'sleep 5', - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', # `grep -A 4 {name}` finds the job with {name} and the 4 lines # after it, i.e. the 4 tasks within the job. # `sed -n 2p` gets the second line of the 4 lines, i.e. the first # task within the job. 
- f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', f'sky jobs cancel -y -n {name}', 'sleep 5', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', 'sleep 200', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack #fluidstack does not support spot instances @@ -141,22 +138,23 @@ def test_job_pipeline(generic_cloud: str): @pytest.mark.managed_jobs def test_managed_jobs_failed_setup(generic_cloud: str): """Test managed job with failed setup.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed_jobs_failed_setup', [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', # Make sure the job failed quickly. + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.FAILED_SETUP], - timeout=330 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.FAILED_SETUP], + timeout=330 + smoke_tests_utils.BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack #fluidstack does not support spot instances @@ -168,31 +166,32 @@ def test_managed_jobs_failed_setup(generic_cloud: str): @pytest.mark.managed_jobs def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): """Test managed job with failed setup for a pipeline.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.FAILED_SETUP], + job_status=[sky.ManagedJobStatus.FAILED_SETUP], timeout=600), # Make sure the job failed quickly. 
- f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', # Task 0 should be SUCCEEDED. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', # Task 1 should be FAILED_SETUP. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', # Task 2 should be CANCELLED. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', # Task 3 should be CANCELLED. - f'{GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing managed job recovery ---------- @@ -202,17 +201,18 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): @pytest.mark.managed_jobs def test_managed_jobs_recovery_aws(aws_config_region): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) region = aws_config_region - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. @@ -221,25 +221,26 @@ def test_managed_jobs_recovery_aws(aws_config_region): f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_recovery_gcp(): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) zone = 'us-east4-b' @@ -250,48 +251,51 @@ def test_managed_jobs_recovery_gcp(): f'--zones={zone} --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" 
-y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. terminate_cmd, - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @pytest.mark.managed_jobs def test_managed_jobs_pipeline_recovery_aws(aws_config_region): """Test managed job recovery for a pipeline.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() user_hash = common_utils.get_user_hash() user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] region = aws_config_region if region != 'us-east-2': pytest.skip('Only run spot pipeline recovery test in us-east-2') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -309,11 +313,12 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): f'-{user_hash} ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -323,14 +328,14 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_pipeline_recovery_gcp(): """Test managed job recovery for a pipeline.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() zone = 'us-east4-b' user_hash = common_utils.get_user_hash() user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] @@ -340,13 +345,14 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'--zones={zone} 
 --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', @@ -356,11 +362,12 @@ def test_managed_jobs_pipeline_recovery_gcp(): # separated by `-`. (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', @@ -370,7 +377,7 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support spot instances @@ -382,39 +389,42 @@ def test_managed_jobs_pipeline_recovery_gcp(): @pytest.mark.managed_jobs def test_managed_jobs_recovery_default_resources(generic_cloud: str): """Test managed job recovery for default resources.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ - ManagedJobStatus.RUNNING, ManagedJobStatus.RECOVERING + sky.ManagedJobStatus.RUNNING, + sky.ManagedJobStatus.RECOVERING ], timeout=360), ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @pytest.mark.managed_jobs def test_managed_jobs_recovery_multi_node_aws(aws_config_region): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) region = aws_config_region - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. @@ -424,25 +434,26 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'Name=tag:ray-node-type,Values=worker ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_recovery_multi_node_gcp(): """Test managed job recovery.""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) zone = 'us-west2-a' @@ -453,35 +464,37 @@ def test_managed_jobs_recovery_multi_node_gcp(): f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. terminate_cmd, - JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.RUNNING], + job_status=[sky.ManagedJobStatus.RUNNING], timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @pytest.mark.managed_jobs def test_managed_jobs_cancellation_aws(aws_config_region): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_on_cloud = common_utils.make_cluster_name_on_cloud( name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) name_2_on_cloud = common_utils.make_cluster_name_on_cloud( @@ -489,22 +502,24 @@ def test_managed_jobs_cancellation_aws(aws_config_region): name_3_on_cloud = common_utils.make_cluster_name_on_cloud( f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) region = aws_config_region - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_cancellation_aws', [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ - ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING ], - timeout=60 + BUMP_UP_SECONDS), + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -513,15 +528,17 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -530,23 +547,25 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', # The job is running in the cluster, will shown as RUNNING. + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$(aws ec2 describe-instances --region {region} ' @@ -556,13 +575,13 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), ], timeout=25 * 60) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @pytest.mark.managed_jobs def test_managed_jobs_cancellation_gcp(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() name_3 = f'{name}-3' name_3_on_cloud = common_utils.make_cluster_name_on_cloud( name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) @@ -576,54 +595,60 @@ def test_managed_jobs_cancellation_gcp(): f'--zones={zone} --format="value(name)"') terminate_cmd = (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_cancellation_gcp', [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.STARTING], - timeout=60 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.STARTING], + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', # The job is set up in the cluster, will shown as RUNNING. + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.RUNNING], - timeout=300 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.RUNNING], + timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), # Terminate the cluster manually. terminate_cmd, - JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-3', - job_status=[ManagedJobStatus.CANCELLED], - timeout=120 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.CANCELLED], + timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. 
(f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' ), ], timeout=25 * 60) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing storage for managed job ---------- @@ -635,7 +660,7 @@ def test_managed_jobs_cancellation_gcp(): @pytest.mark.managed_jobs def test_managed_jobs_storage(generic_cloud: str): """Test storage with managed job""" - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() yaml_str = pathlib.Path( 'examples/managed_job_with_storage.yaml').read_text() timestamp = int(time.time()) @@ -700,16 +725,17 @@ def test_managed_jobs_storage(generic_cloud: str): f.write(yaml_str) f.flush() file_path = f.name - test = Test( + test = smoke_tests_utils.Test( 'managed_jobs_storage', [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.SUCCEEDED], - timeout=60 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.SUCCEEDED], + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), # Wait for the job to be cleaned up. 'sleep 20', f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', @@ -721,7 +747,7 @@ def test_managed_jobs_storage(generic_cloud: str): # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing spot TPU ---------- @@ -730,43 +756,46 @@ def test_managed_jobs_storage(generic_cloud: str): @pytest.mark.tpu def test_managed_jobs_tpu(): """Test managed job on TPU.""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.STARTING], - timeout=60 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.STARTING], + timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), # TPU takes a while to launch + smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ - ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED + sky.ManagedJobStatus.RUNNING, sky.ManagedJobStatus.SUCCEEDED ], - timeout=900 + BUMP_UP_SECONDS), + timeout=900 + smoke_tests_utils.BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing env for managed jobs ---------- @pytest.mark.managed_jobs def test_managed_jobs_inline_env(generic_cloud: str): """Test managed jobs env""" - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "echo "\\$TEST_ENV"; ([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=[ManagedJobStatus.SUCCEEDED], - timeout=20 + BUMP_UP_SECONDS), + job_status=[sky.ManagedJobStatus.SUCCEEDED], + timeout=20 + smoke_tests_utils.BUMP_UP_SECONDS), f'JOB_ROW=$(sky jobs queue | grep {name} | head -n1) && ' f'echo "$JOB_ROW" && echo "$JOB_ROW" | grep "SUCCEEDED" && ' f'JOB_ID=$(echo "$JOB_ROW" | awk \'{{print $1}}\') && ' @@ -780,4 +809,4 @@ def test_managed_jobs_inline_env(generic_cloud: str): # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py index 6a2f0944fec..4889bdcc85e 100644 --- a/tests/smoke_tests/test_mount_and_storage.py +++ b/tests/smoke_tests/test_mount_and_storage.py @@ -32,12 +32,7 @@ import jinja2 import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util import get_timeout -from smoke_tests.util import run_one_test -from smoke_tests.util import SCP_TYPE -from smoke_tests.util import STORAGE_SETUP_COMMANDS -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky from sky import global_user_state @@ -52,7 +47,7 @@ # ---------- file_mounts ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. def test_file_mounts(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() extra_flags = '' if generic_cloud in 'kubernetes': # Kubernetes does not support multi-node @@ -60,42 +55,42 @@ def test_file_mounts(generic_cloud: str): # arm64 (e.g., Apple Silicon) since goofys does not work on arm64. 
extra_flags = '--num-nodes 1' test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. ] - test = Test( + test = smoke_tests_utils.Test( 'using_file_mounts', test_commands, f'sky down -y {name}', - get_timeout(generic_cloud, 20 * 60), # 20 mins + smoke_tests_utils.get_timeout(generic_cloud, 20 * 60), # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.scp def test_scp_file_mounts(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} {SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, + f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. ] - test = Test( + test = smoke_tests_utils.Test( 'SCP_using_file_mounts', test_commands, f'sky down -y {name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Requires GCP to be enabled def test_using_file_mounts_with_env_vars(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = TestStorageWithCredentials.generate_bucket_name() test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} ' 'examples/using_file_mounts_with_env_vars.yaml ' f'--env MY_BUCKET={storage_name}'), @@ -107,20 +102,20 @@ def test_using_file_mounts_with_env_vars(generic_cloud: str): '--env MY_LOCAL_PATH=tmpfile'), f'sky logs {name}-2 1 --status', # Ensure the job succeeded. 
] - test = Test( + test = smoke_tests_utils.Test( 'using_file_mounts_with_env_vars', test_commands, (f'sky down -y {name} {name}-2', f'sky storage delete -y {storage_name} {storage_name}-2'), timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- storage ---------- @pytest.mark.aws def test_aws_storage_mounts_with_stop(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = 'aws' storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( @@ -132,7 +127,7 @@ def test_aws_storage_mounts_with_stop(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'aws s3 ls {storage_name}/hello.txt', @@ -142,18 +137,18 @@ def test_aws_storage_mounts_with_stop(): # the mounted directory f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] - test = Test( + test = smoke_tests_utils.Test( 'aws_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_storage_mounts_with_stop(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = 'gcp' storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( @@ -165,7 +160,7 @@ def test_gcp_storage_mounts_with_stop(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. 
f'gsutil ls gs://{storage_name}/hello.txt', @@ -175,18 +170,18 @@ def test_gcp_storage_mounts_with_stop(): # the mounted directory f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] - test = Test( + test = smoke_tests_utils.Test( 'gcp_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_storage_mounts_with_stop(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() cloud = 'azure' storage_name = f'sky-test-{int(time.time())}' default_region = 'eastus' @@ -203,7 +198,7 @@ def test_azure_storage_mounts_with_stop(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'output=$(az storage blob list -c {storage_name} --account-name {storage_account_name} --account-key {storage_account_key} --prefix hello.txt)' @@ -215,13 +210,13 @@ def test_azure_storage_mounts_with_stop(): # the mounted directory f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] - test = Test( + test = smoke_tests_utils.Test( 'azure_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -229,7 +224,7 @@ def test_kubernetes_storage_mounts(): # Tests bucket mounting on k8s, assuming S3 is configured. # This test will fail if run on non x86_64 architecture, since goofys is # built for x86_64 only. 
- name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() @@ -240,24 +235,24 @@ def test_kubernetes_storage_mounts(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud kubernetes {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'aws s3 ls {storage_name}/hello.txt || ' f'gsutil ls gs://{storage_name}/hello.txt', ] - test = Test( + test = smoke_tests_utils.Test( 'kubernetes_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes def test_kubernetes_context_switch(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() new_context = f'sky-test-context-{int(time.time())}' new_namespace = f'sky-test-namespace-{int(time.time())}' @@ -301,13 +296,13 @@ def test_kubernetes_context_switch(): 'rm /tmp/sky_test_current_context; ' f'sky down -y {name}') - test = Test( + test = smoke_tests_utils.Test( 'kubernetes_context_switch', test_commands, cleanup_commands, timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.parametrize( @@ -323,7 +318,7 @@ def test_kubernetes_context_switch(): ]) def test_docker_storage_mounts(generic_cloud: str, image_id: str): # Tests bucket mounting on docker container - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() timestamp = str(time.time()).replace('.', '') storage_name = f'sky-test-{timestamp}' template_str = pathlib.Path( @@ -354,7 +349,7 @@ def test_docker_storage_mounts(generic_cloud: str, image_id: str): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky 
launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. # Check AWS, GCP, or Azure storage mount. @@ -362,18 +357,18 @@ def test_docker_storage_mounts(generic_cloud: str, image_id: str): f'{gsutil_command} || ' f'{azure_blob_command}', ] - test = Test( + test = smoke_tests_utils.Test( 'docker_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.cloudflare def test_cloudflare_storage_mounts(generic_cloud: str): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( 'tests/test_yamls/test_r2_storage_mounting.yaml').read_text() @@ -385,24 +380,24 @@ def test_cloudflare_storage_mounts(generic_cloud: str): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. 
f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2' ] - test = Test( + test = smoke_tests_utils.Test( 'cloudflare_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_storage_mounts(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() storage_name = f'sky-test-{int(time.time())}' bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( storage_name, Rclone.RcloneClouds.IBM) @@ -415,18 +410,18 @@ def test_ibm_storage_mounts(): f.flush() file_path = f.name test_commands = [ - *STORAGE_SETUP_COMMANDS, + *smoke_tests_utils.STORAGE_SETUP_COMMANDS, f'sky launch -y -c {name} --cloud ibm {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt', ] - test = Test( + test = smoke_tests_utils.Test( 'ibm_storage_mounts', test_commands, f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Testing Storage ---------- diff --git a/tests/smoke_tests/test_region_and_zone.py b/tests/smoke_tests/test_region_and_zone.py index e2e58bb3c62..706cd3bb64a 100644 --- a/tests/smoke_tests/test_region_and_zone.py +++ b/tests/smoke_tests/test_region_and_zone.py @@ -23,12 +23,7 @@ import textwrap import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util import get_cmd_wait_until_cluster_status_contains_wildcard -from smoke_tests.util import ( - get_cmd_wait_until_managed_job_status_contains_matching_job_name) -from smoke_tests.util import run_one_test -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky from sky.skylet import constants @@ -37,8 +32,8 @@ # ---------- Test region 
---------- @pytest.mark.aws def test_aws_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_region', [ f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', @@ -53,12 +48,12 @@ def test_aws_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws def test_aws_with_ssh_proxy_command(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() with tempfile.NamedTemporaryFile(mode='w') as f: f.write( @@ -67,7 +62,7 @@ def test_aws_with_ssh_proxy_command(): ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} """)) f.flush() - test = Test( + test = smoke_tests_utils.Test( 'aws_with_ssh_proxy_command', [ f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', @@ -81,11 +76,13 @@ def test_aws_with_ssh_proxy_command(): f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', # Wait other tests to create the job controller first, so that # the job controller is not launched with proxy command. + smoke_tests_utils. get_cmd_wait_until_cluster_status_contains_wildcard( cluster_name_wildcard='sky-jobs-controller-*', cluster_status=[sky.ClusterStatus.UP], timeout=300), f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', + smoke_tests_utils. 
get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, job_status=[ @@ -97,13 +94,13 @@ def test_aws_with_ssh_proxy_command(): ], f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_region_and_service_account(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_region', [ f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', @@ -120,14 +117,14 @@ def test_gcp_region_and_service_account(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_region(): - name = get_cluster_name() + name = smoke_tests_utils.get_cluster_name() region = 'eu-de' - test = Test( + test = smoke_tests_utils.Test( 'region', [ f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', @@ -137,13 +134,13 @@ def test_ibm_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure def test_azure_region(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'azure_region', [ f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', @@ -160,14 +157,14 @@ def test_azure_region(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # ---------- Test zone ---------- @pytest.mark.aws def test_aws_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'aws_zone', [ f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', @@ -177,14 +174,14 @@ def test_aws_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.ibm def test_ibm_zone(): - name = 
get_cluster_name() + name = smoke_tests_utils.get_cluster_name() zone = 'eu-de-2' - test = Test( + test = smoke_tests_utils.Test( 'zone', [ f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', @@ -194,13 +191,13 @@ def test_ibm_zone(): ], f'sky down -y {name} {name}-2 {name}-3', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp def test_gcp_zone(): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'gcp_zone', [ f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', @@ -210,4 +207,4 @@ def test_gcp_zone(): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_required_before_merge.py index ffaf75e7cbc..1d68b8a81e6 100644 --- a/tests/smoke_tests/test_required_before_merge.py +++ b/tests/smoke_tests/test_required_before_merge.py @@ -19,21 +19,18 @@ # Change cloud for generic tests to aws # > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws -from smoke_tests.util import get_cluster_name -from smoke_tests.util import ( - get_cmd_wait_until_job_status_contains_matching_job_id) -from smoke_tests.util import run_one_test -from smoke_tests.util import Test +from smoke_tests import smoke_tests_utils import sky def test_yaml_launch_and_mount(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', [ f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', + smoke_tests_utils. 
get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=1, @@ -43,4 +40,4 @@ def test_yaml_launch_and_mount(generic_cloud: str): f'sky down -y {name}', timeout=5 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py index f56d9bb96ee..5f34eba8728 100644 --- a/tests/smoke_tests/test_sky_serve.py +++ b/tests/smoke_tests/test_sky_serve.py @@ -28,11 +28,7 @@ from typing import List, Tuple import pytest -from smoke_tests.util import get_cluster_name -from smoke_tests.util import run_one_test -from smoke_tests.util import terminate_gcp_replica -from smoke_tests.util import Test -from smoke_tests.util import test_id +from smoke_tests import smoke_tests_utils from sky import serve from sky.utils import common_utils @@ -49,7 +45,7 @@ def _get_service_name() -> str: test_name = caller_func_name.replace('_', '-').replace('test-', 't-') test_name = test_name.replace('skyserve-', 'ss-') test_name = common_utils.make_cluster_name_on_cloud(test_name, 24) - return f'{test_name}-{test_id}' + return f'{test_name}-{smoke_tests_utils.test_id}' # We check the output of the skyserve service to see if it is ready. 
Output of @@ -107,8 +103,8 @@ def _get_replica_ip(name: str, replica_id: int) -> str: def _get_skyserve_http_test(name: str, cloud: str, - timeout_minutes: int) -> Test: - test = Test( + timeout_minutes: int) -> smoke_tests_utils.Test: + test = smoke_tests_utils.Test( f'test-skyserve-{cloud.replace("_", "-")}', [ f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', @@ -161,7 +157,7 @@ def test_skyserve_gcp_http(): """Test skyserve on GCP""" name = _get_service_name() test = _get_skyserve_http_test(name, 'gcp', 20) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.aws @@ -170,7 +166,7 @@ def test_skyserve_aws_http(): """Test skyserve on AWS""" name = _get_service_name() test = _get_skyserve_http_test(name, 'aws', 20) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.azure @@ -179,7 +175,7 @@ def test_skyserve_azure_http(): """Test skyserve on Azure""" name = _get_service_name() test = _get_skyserve_http_test(name, 'azure', 30) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.kubernetes @@ -188,7 +184,7 @@ def test_skyserve_kubernetes_http(): """Test skyserve on Kubernetes""" name = _get_service_name() test = _get_skyserve_http_test(name, 'kubernetes', 30) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.oci @@ -197,7 +193,7 @@ def test_skyserve_oci_http(): """Test skyserve on OCI""" name = _get_service_name() test = _get_skyserve_http_test(name, 'oci', 20) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now @@ -218,7 +214,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: encoding='utf-8') as f: prompt2output = json.load(f) - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-llm', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', @@ -231,7 +227,7 @@ def generate_llm_test_command(prompt: str, expected_output: 
str) -> str: _TEARDOWN_SERVICE.format(name=name), timeout=40 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @@ -240,14 +236,14 @@ def test_skyserve_spot_recovery(): name = _get_service_name() zone = 'us-central1-a' - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-spot-recovery-gcp', [ f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - terminate_gcp_replica(name, zone, 1), + smoke_tests_utils.terminate_gcp_replica(name, zone, 1), _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', @@ -255,7 +251,7 @@ def test_skyserve_spot_recovery(): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Fluidstack does not support spot instances @@ -263,7 +259,7 @@ def test_skyserve_spot_recovery(): @pytest.mark.no_kubernetes def test_skyserve_base_ondemand_fallback(generic_cloud: str): name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-base-ondemand-fallback', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', @@ -274,7 +270,7 @@ def test_skyserve_base_ondemand_fallback(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @@ -283,7 +279,7 @@ def test_skyserve_dynamic_ondemand_fallback(): name = _get_service_name() zone = 'us-central1-a' - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-dynamic-ondemand-fallback', [ f'sky serve up -n 
{name} --cloud gcp -y tests/skyserve/spot/dynamic_ondemand_fallback.yaml', @@ -302,7 +298,7 @@ def test_skyserve_dynamic_ondemand_fallback(): _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), _check_replica_in_status(name, [(2, True, 'READY'), (0, False, '')]), - terminate_gcp_replica(name, zone, 1), + smoke_tests_utils.terminate_gcp_replica(name, zone, 1), f'sleep 40', # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). f'{_SERVE_STATUS_WAIT.format(name=name)}; ' @@ -320,7 +316,7 @@ def test_skyserve_dynamic_ondemand_fallback(): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -330,7 +326,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): """Tests that we restart the service after user bug.""" # TODO(zhwu): this behavior needs some rethinking. name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-user-bug-restart', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', @@ -355,7 +351,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve @@ -363,7 +359,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): def test_skyserve_load_balancer(generic_cloud: str): """Test skyserve load balancer round-robin policy""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-load-balancer', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', @@ -378,7 +374,7 @@ def test_skyserve_load_balancer(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.gcp @@ -388,7 +384,7 @@ def 
test_skyserve_auto_restart(): """Test skyserve with auto restart""" name = _get_service_name() zone = 'us-central1-a' - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-auto-restart', [ # TODO(tian): we can dynamically generate YAML from template to @@ -400,7 +396,7 @@ def test_skyserve_auto_restart(): # sleep for 20 seconds (initial delay) to make sure it will # be restarted f'sleep 20', - terminate_gcp_replica(name, zone, 1), + smoke_tests_utils.terminate_gcp_replica(name, zone, 1), # Wait for consecutive failure timeout passed. # If the cluster is not using spot, it won't check the cluster status # on the cloud (since manual shutdown is not a common behavior and such @@ -421,7 +417,7 @@ def test_skyserve_auto_restart(): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve @@ -429,7 +425,7 @@ def test_skyserve_cancel(generic_cloud: str): """Test skyserve with cancel""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-cancel', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', @@ -446,14 +442,14 @@ def test_skyserve_cancel(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_streaming(generic_cloud: str): """Test skyserve with streaming""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-streaming', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', @@ -465,14 +461,14 @@ def test_skyserve_streaming(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_readiness_timeout_fail(generic_cloud: str): """Test skyserve with large readiness probe latency, expected to fail""" 
name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-readiness-timeout-fail', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', @@ -488,14 +484,14 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_large_readiness_timeout(generic_cloud: str): """Test skyserve with customized large readiness timeout""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-large-readiness-timeout', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', @@ -506,7 +502,7 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -515,7 +511,7 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): def test_skyserve_update(generic_cloud: str): """Test skyserve with update""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-update', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', @@ -536,7 +532,7 @@ def test_skyserve_update(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -548,7 +544,7 @@ def test_skyserve_rolling_update(generic_cloud: str): single_new_replica = _check_replica_in_status( name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), (1, False, 'SHUTTING_DOWN')]) - test = Test( + test = smoke_tests_utils.Test( 
f'test-skyserve-rolling-update', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', @@ -574,7 +570,7 @@ def test_skyserve_rolling_update(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack @@ -583,7 +579,7 @@ def test_skyserve_fast_update(generic_cloud: str): """Test skyserve with fast update (Increment version of old replicas)""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-fast-update', [ f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', @@ -616,14 +612,14 @@ def test_skyserve_fast_update(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.serve def test_skyserve_update_autoscale(generic_cloud: str): """Test skyserve update with autoscale""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-update-autoscale', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', @@ -652,7 +648,7 @@ def test_skyserve_update_autoscale(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) @pytest.mark.no_fluidstack # Spot instances are note supported by Fluidstack @@ -690,7 +686,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): (2, False, 'READY')]) + _check_service_version(name, "1"), ] - test = Test( + test = smoke_tests_utils.Test( f'test-skyserve-new-autoscaler-update-{mode}', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', @@ -716,7 +712,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + 
smoke_tests_utils.run_one_test(test) # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs @@ -726,7 +722,7 @@ def test_skyserve_failures(generic_cloud: str): """Test replica failure statuses""" name = _get_service_name() - test = Test( + test = smoke_tests_utils.Test( 'test-skyserve-failures', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', @@ -764,7 +760,7 @@ def test_skyserve_failures(generic_cloud: str): _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) # TODO(Ziming, Tian): Add tests for autoscaling. @@ -772,8 +768,8 @@ def test_skyserve_failures(generic_cloud: str): # ------- Testing user dependencies -------- def test_user_dependencies(generic_cloud: str): - name = get_cluster_name() - test = Test( + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( 'user-dependencies', [ f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', @@ -792,4 +788,4 @@ def test_user_dependencies(generic_cloud: str): ], f'sky down -y {name}', ) - run_one_test(test) + smoke_tests_utils.run_one_test(test) From 7db0579cc5dba21007b68404bc35da01e60358c6 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 5 Dec 2024 14:25:17 +0800 Subject: [PATCH 55/64] generate all cloud --- .buildkite/generate_pipeline.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index f2edae5dfca..d3070ab91f8 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -141,8 +141,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: return function_cloud_map -def _generate_pipeline(test_file: str, - one_cloud_per_test_function: bool) -> Dict[str, Any]: +def _generate_pipeline(test_file: str) -> Dict[str, Any]: """Generate a Buildkite pipeline from 
test files.""" steps = [] function_cloud_map = _extract_marked_tests(test_file) @@ -160,8 +159,6 @@ def _generate_pipeline(test_file: str, 'if': f'build.env("{cloud}") == "1"' } steps.append(step) - if one_cloud_per_test_function: - break return {'steps': steps} @@ -191,8 +188,7 @@ def _convert_release(test_files: List[str]): output_file_pipelines_map = defaultdict(list) for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') - # We only need to run one cloud per test function. - pipeline = _generate_pipeline(test_file, True) + pipeline = _generate_pipeline(test_file) output_file_pipelines_map[yaml_file_path].append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') # Enable all clouds by default for release pipeline. @@ -208,7 +204,7 @@ def _convert_pre_merge(test_files: List[str]): # We want enable all clouds by default for each test function # for pre-merge. And let the author controls which clouds # to run by parameter. - pipeline = _generate_pipeline(test_file, False) + pipeline = _generate_pipeline(test_file) pipeline['steps'].append({ 'label': 'Backward compatibility test', 'command': 'bash tests/backward_compatibility_tests.sh', From 4428c90115ac63de71e1661e1f92e0837be88d26 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 11:17:26 +0800 Subject: [PATCH 56/64] resolve PR comment --- .buildkite/generate_pipeline.py | 86 +++++++++++-------- ...ired_before_merge.py => test_pre_merge.py} | 0 2 files changed, 50 insertions(+), 36 deletions(-) rename tests/smoke_tests/{test_required_before_merge.py => test_pre_merge.py} (100%) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index d3070ab91f8..3446d7d683f 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -5,27 +5,23 @@ tests/smoke_tests ├── test_*.py -> release pipeline -├── test_required_before_merge.py -> pre-merge pipeline +├── test_pre_merge.py -> pre-merge pipeline run `python 
.buildkite/generate_pipeline.py` to generate the pipeline for testing. The CI will run this script as a pre-step, and use the generated pipeline to run the tests. -1. release pipeline, which runs all smoke tests by default, some function - support tests by multiple clouds, but we only generate one cloud per test - function to save cost. -2. pre-merge pipeline, which generates all clouds supported by the test - function, author should specify which clouds to run by setting env in the - step. +1. release pipeline, which runs all smoke tests by default, generates all + smoke tests for all clouds. +2. pre-merge pipeline, which generates all smoke tests for all clouds, + author should specify which clouds to run by setting env in the step. -We only have credentials for aws/azure/gcp/kubernetes(CLOUD_QUEUE_MAP) now, -smoke tests for those clouds are generated, other clouds are not supported -yet, smoke tests for those clouds are not generated. +We only have credentials for aws/azure/gcp/kubernetes(CLOUD_QUEUE_MAP and +SERVE_CLOUD_QUEUE_MAP) now, smoke tests for those clouds are generated, other +clouds are not supported yet, smoke tests for those clouds are not generated. """ import ast -from collections import defaultdict -import copy import os import random from typing import Any, Dict, List, Optional @@ -78,8 +74,19 @@ def _get_full_decorator_path(decorator: ast.AST) -> str: def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: - """Extract test functions and filter clouds with pytest.mark - from a Python test file.""" + """Extract test functions and filter clouds using pytest.mark + from a Python test file. + + We separate each test_function_{cloud} into different pipeline steps + to maximize the parallelism of the tests via the buildkite CI job queue. + This allows us to visualize the test results and rerun failures at the + granularity of each test_function_{cloud}. 
+ + If we make pytest --serve a job, it could contain dozens of test_functions + and run for hours. This makes it hard to visualize the test results and + rerun failures. Additionally, the parallelism would be controlled by pytest + instead of the buildkite job queue. + """ with open(file_path, 'r', encoding='utf-8') as file: tree = ast.parse(file.read(), filename=file_path) @@ -118,7 +125,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: continue clouds_to_include.append(suffix) clouds_to_include = (clouds_to_include if clouds_to_include else - copy.deepcopy(DEFAULT_CLOUDS_TO_RUN)) + DEFAULT_CLOUDS_TO_RUN) clouds_to_include = [ cloud for cloud in clouds_to_include if cloud not in clouds_to_exclude @@ -133,6 +140,14 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: f'but we do not have credentials for those clouds. ' f'Skipped.') continue + if clouds_to_include != final_clouds_to_include: + excluded_clouds = set(clouds_to_include) - set( + final_clouds_to_include) + print( + f'Warning: {file_path}:{node.name} ' + f'is marked to run on {clouds_to_include}, ' + f'but we only have credentials for {final_clouds_to_include}. 
' + f'clouds {excluded_clouds} are skipped.') function_name = (f'{class_name}::{node.name}' if class_name else node.name) function_cloud_map[function_name] = (final_clouds_to_include, [ @@ -162,43 +177,41 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]: return {'steps': steps} -def _dump_pipeline_to_file(output_file_pipelines_map: Dict[str, - List[Dict[str, - Any]]], +def _dump_pipeline_to_file(yaml_file_path: str, + pipelines: List[Dict[str, Any]], extra_env: Optional[Dict[str, str]] = None): default_env = {'LOG_TO_STDOUT': '1', 'PYTHONPATH': '${PYTHONPATH}:$(pwd)'} if extra_env: default_env.update(extra_env) - - for yaml_file_path, pipelines in output_file_pipelines_map.items(): - with open(yaml_file_path, 'w', encoding='utf-8') as file: - file.write(GENERATED_FILE_HEAD) - all_steps = [] - for pipeline in pipelines: - all_steps.extend(pipeline['steps']) - # Shuffle the steps to avoid flakyness, consecutive runs of the same - # kind of test may fail for requiring locks on the same resources. - random.shuffle(all_steps) - final_pipeline = {'steps': all_steps, 'env': default_env} - yaml.dump(final_pipeline, file, default_flow_style=False) + with open(yaml_file_path, 'w', encoding='utf-8') as file: + file.write(GENERATED_FILE_HEAD) + all_steps = [] + for pipeline in pipelines: + all_steps.extend(pipeline['steps']) + # Shuffle the steps to avoid flakyness, consecutive runs of the same + # kind of test may fail for requiring locks on the same resources. 
+ random.shuffle(all_steps) + final_pipeline = {'steps': all_steps, 'env': default_env} + yaml.dump(final_pipeline, file, default_flow_style=False) def _convert_release(test_files: List[str]): yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml' - output_file_pipelines_map = defaultdict(list) + output_file_pipelines = [] for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') pipeline = _generate_pipeline(test_file) - output_file_pipelines_map[yaml_file_path].append(pipeline) + output_file_pipelines.append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') # Enable all clouds by default for release pipeline. - _dump_pipeline_to_file(output_file_pipelines_map, + _dump_pipeline_to_file(yaml_file_path, + output_file_pipelines, extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP}) def _convert_pre_merge(test_files: List[str]): yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' - output_file_pipelines_map = defaultdict(list) + output_file_pipelines = [] for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') # We want enable all clouds by default for each test function @@ -213,9 +226,10 @@ def _convert_pre_merge(test_files: List[str]): }, 'if': 'build.env("aws") == "1"' }) - output_file_pipelines_map[yaml_file_path].append(pipeline) + output_file_pipelines.append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') - _dump_pipeline_to_file(output_file_pipelines_map, + _dump_pipeline_to_file(yaml_file_path, + output_file_pipelines, extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'}) diff --git a/tests/smoke_tests/test_required_before_merge.py b/tests/smoke_tests/test_pre_merge.py similarity index 100% rename from tests/smoke_tests/test_required_before_merge.py rename to tests/smoke_tests/test_pre_merge.py From ce550e70087fb7d310685ddab2d77a43f56fe122 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 12:26:20 +0800 Subject: [PATCH 57/64] update comment 
--- tests/test_smoke.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 50824da7ec1..2d0f7605bc4 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,9 +25,8 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws """ -# This is the content that will be used in the future. # Currently copy back the tests/smoke_tests/* to tests/test_smoke.py for review. -# After review, we will remove the copy back part and use content below. +# After review, we will remove all contents in this file and use content below. # All files categorized under tests/smoke_tests/* # Please add new test cases under that directory. From e389780c4a355d25e510dd65d1fa7ccdd8426f53 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 12:39:21 +0800 Subject: [PATCH 58/64] naming fix --- .buildkite/generate_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 3446d7d683f..c2570ec465e 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -241,7 +241,7 @@ def main(): if not test_file.startswith('test_'): continue test_file_path = os.path.join('tests/smoke_tests', test_file) - if "required_before_merge" in test_file: + if "test_pre_merge" in test_file: pre_merge_files.append(test_file_path) else: release_files.append(test_file_path) From 74b2d6e2cb66d35b1934f0a86c44d298198058f0 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Tue, 10 Dec 2024 12:42:53 +0800 Subject: [PATCH 59/64] grammar correction --- sky/sky_logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/sky_logging.py b/sky/sky_logging.py index 944cbcf46d4..effeab310d8 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -77,7 +77,7 @@ def _setup_logger(): # being propagated to the parent logger. 
_root_logger.propagate = False if env_options.Options.SUPPRESS_SENSITIVE_LOG.get(): - # If the sensitive log is enabled, we re init a new handler + # If the sensitive log is enabled, we reinitialize a new handler # and force set the level to INFO to suppress the debug logs # for certain loggers. for logger_name in _SENSITIVE_LOGGER: From 595c0431261b08e9fd077584f6123ffb3f5baab0 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 11 Dec 2024 18:17:59 +0800 Subject: [PATCH 60/64] resolve PR comment --- .buildkite/generate_pipeline.py | 16 +- tests/backward_compatibility_tests.sh | 2 + tests/smoke_tests/test_pre_merge.py | 2 +- tests/test_smoke.py | 5770 ----------------- ...merge.yaml => minimal_test_pre_merge.yaml} | 2 +- 5 files changed, 14 insertions(+), 5778 deletions(-) rename tests/test_yamls/{minimal_test_required_before_merge.yaml => minimal_test_pre_merge.yaml} (60%) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index c2570ec465e..8f1389d409a 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,17 +24,21 @@ import ast import os import random +import sys from typing import Any, Dict, List, Optional import yaml -DEFAULT_CLOUDS_TO_RUN = ['aws', 'azure'] +# Add project root to Python path +tests_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'tests') +sys.path.append(tests_path) + +from conftest import all_clouds_in_smoke_tests +from conftest import default_clouds_to_run + +DEFAULT_CLOUDS_TO_RUN = default_clouds_to_run +ALL_CLOUDS_IN_SMOKE_TESTS = all_clouds_in_smoke_tests -ALL_CLOUDS_IN_SMOKE_TESTS = [ - 'aws', 'gcp', 'azure', 'lambda', 'cloudflare', 'ibm', 'scp', 'oci', - 'kubernetes', 'vsphere', 'cudo', 'fluidstack', 'paperspace', 'runpod', - 'lambda_cloud' -] QUEUE_GENERIC_CLOUD = 'generic_cloud' QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve' QUEUE_KUBERNETES = 'kubernetes' diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 
511b2c9ba6b..d32e1e9e224 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -36,6 +36,7 @@ cd ../sky-master git pull origin master pip uninstall -y skypilot pip install uv +uv pip install --prerelease=allow "azure-cli>=2.65.0" uv pip install -e ".[all]" cd - @@ -45,6 +46,7 @@ conda install -c conda-forge google-cloud-sdk -y rm -r ~/.sky/wheels || true pip uninstall -y skypilot pip install uv +uv pip install --prerelease=allow "azure-cli>=2.65.0" uv pip install -e ".[all]" diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_pre_merge.py index 1d68b8a81e6..a2da638b8de 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_pre_merge.py @@ -29,7 +29,7 @@ def test_yaml_launch_and_mount(generic_cloud: str): test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', [ - f'sky launch -y -c {name} tests/test_yamls/minimal_test_required_before_merge.yaml', + f'sky launch -y -c {name} tests/test_yamls/minimal_test_pre_merge.yaml', smoke_tests_utils. get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 2d0f7605bc4..b33c1d80bce 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -24,12 +24,6 @@ # # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws -""" -# Currently copy back the tests/smoke_tests/* to tests/test_smoke.py for review. -# After review, we will remove all contents in this file and use content below. - -# All files categorized under tests/smoke_tests/* -# Please add new test cases under that directory. 
from smoke_tests.test_basic import * from smoke_tests.test_cluster_job import * @@ -38,5767 +32,3 @@ from smoke_tests.test_mount_and_storage import * from smoke_tests.test_region_and_zone import * from smoke_tests.test_sky_serve import * -""" -import inspect -import json -import os -import pathlib -import shlex -import shutil -import subprocess -import tempfile -import textwrap -import time -from typing import Dict, List, Optional, TextIO, Tuple -import urllib.parse -import uuid - -import jinja2 -import pytest -from smoke_tests import smoke_tests_utils - -import sky -from sky import global_user_state -from sky import jobs -from sky import serve -from sky import skypilot_config -from sky.adaptors import azure -from sky.adaptors import cloudflare -from sky.adaptors import ibm -from sky.clouds import AWS -from sky.clouds import Azure -from sky.clouds import GCP -from sky.data import data_utils -from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone -from sky.skylet import constants -from sky.skylet import events -from sky.utils import common_utils -from sky.utils import resources_utils - - -# ---------- Dry run: 2 Tasks in a chain. ---------- -@pytest.mark.no_fluidstack #requires GCP and AWS set up -def test_example_app(): - test = smoke_tests_utils.Test( - 'example_app', - ['python examples/example_app.py'], - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- A minimal task ---------- -def test_minimal(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'minimal', - [ - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - # Output validation done. - f'sky logs {name} 1 --status', - f'sky logs {name} --status | grep "Job 1: SUCCEEDED"', # Equivalent. 
- # Test launch output again on existing cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check the logs downloading - f'log_path=$(sky logs {name} 1 --sync-down | grep "Job 1 logs:" | sed -E "s/^.*Job 1 logs: (.*)\\x1b\\[0m/\\1/g") && echo "$log_path" && test -f $log_path/run.log', - # Ensure the raylet process has the correct file descriptor limit. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # Install jq for the next test. - f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', - # Check the cluster info - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', - f'sky logs {name} 5 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', - f'sky logs {name} 6 --status', # Ensure the job succeeded. 
- # Test '-c' for exec - f'sky exec -c {name} echo', - f'sky logs {name} 7 --status', - f'sky exec echo -c {name}', - f'sky logs {name} 8 --status', - f'sky exec -c {name} echo hi test', - f'sky logs {name} 9 | grep "hi test"', - f'sky exec {name} && exit 1 || true', - f'sky exec -c {name} && exit 1 || true', - ], - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test fast launch ---------- -def test_launch_fast(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - - test = smoke_tests_utils.Test( - 'test_launch_fast', - [ - # First launch to create the cluster - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - - # Second launch to test fast launch - should not reprovision - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast tests/test_yamls/minimal.yaml) && ' - ' echo "$s" && ' - # Validate that cluster was not re-launched. - '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' - # Validate that setup was not re-run. - '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' - # Validate that the task ran and finished. - 'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# See cloud exclusion explanations in test_autostop -@pytest.mark.no_fluidstack -@pytest.mark.no_lambda_cloud -@pytest.mark.no_ibm -@pytest.mark.no_kubernetes -def test_launch_fast_with_autostop(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. 
- autostop_timeout = 600 if generic_cloud == 'azure' else 250 - test = smoke_tests_utils.Test( - 'test_launch_fast_with_autostop', - [ - # First launch to create the cluster with a short autostop - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 1 --status', - f'sky status -r {name} | grep UP', - - # Ensure cluster is stopped - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout), - # Even the cluster is stopped, cloud platform may take a while to - # delete the VM. - f'sleep {smoke_tests_utils.BUMP_UP_SECONDS}', - # Launch again. Do full output validation - we expect the cluster to re-launch - f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - ], - f'sky down -y {name}', - timeout=smoke_tests_utils.get_timeout(generic_cloud) + autostop_timeout, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test region ---------- -@pytest.mark.aws -def test_aws_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_region', - [ - f'sky launch -y -c {name} --region us-east-2 examples/minimal.yaml', - f'sky exec {name} examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. 
- f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_with_ssh_proxy_command(): - name = smoke_tests_utils.get_cluster_name() - - with tempfile.NamedTemporaryFile(mode='w') as f: - f.write( - textwrap.dedent(f"""\ - aws: - ssh_proxy_command: ssh -W %h:%p -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null jump-{name} - """)) - f.flush() - test = smoke_tests_utils.Test( - 'aws_with_ssh_proxy_command', - [ - f'sky launch -y -c jump-{name} --cloud aws --cpus 2 --region us-east-1', - # Use jump config - f'export SKYPILOT_CONFIG={f.name}; ' - f'sky launch -y -c {name} --cloud aws --cpus 2 --region us-east-1 echo hi', - f'sky logs {name} 1 --status', - f'export SKYPILOT_CONFIG={f.name}; sky exec {name} echo hi', - f'sky logs {name} 2 --status', - # Start a small job to make sure the controller is created. - f'sky jobs launch -n {name}-0 --cloud aws --cpus 2 --use-spot -y echo hi', - # Wait other tests to create the job controller first, so that - # the job controller is not launched with proxy command. - smoke_tests_utils. - get_cmd_wait_until_cluster_status_contains_wildcard( - cluster_name_wildcard='sky-jobs-controller-*', - cluster_status=[sky.ClusterStatus.UP], - timeout=300), - f'export SKYPILOT_CONFIG={f.name}; sky jobs launch -n {name} --cpus 2 --cloud aws --region us-east-1 -yd echo hi', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.SUCCEEDED, - sky.ManagedJobStatus.RUNNING, - sky.ManagedJobStatus.STARTING - ], - timeout=300), - ], - f'sky down -y {name} jump-{name}; sky jobs cancel -y -n {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_region_and_service_account(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_region', - [ - f'sky launch -y -c {name} --region us-central1 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_region(): - name = smoke_tests_utils.get_cluster_name() - region = 'eu-de' - test = smoke_tests_utils.Test( - 'region', - [ - f'sky launch -y -c {name} --cloud ibm --region {region} examples/minimal.yaml', - f'sky exec {name} --cloud ibm examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep {region}', # Ensure the region is correct. 
- ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure_region', - [ - f'sky launch -y -c {name} --region eastus2 --cloud azure tests/test_yamls/minimal.yaml', - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - # A user program should not access SkyPilot runtime env python by default. - f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test zone ---------- -@pytest.mark.aws -def test_aws_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_zone', - [ - f'sky launch -y -c {name} examples/minimal.yaml --zone us-east-2b', - f'sky exec {name} examples/minimal.yaml --zone us-east-2b', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-east-2b', # Ensure the zone is correct. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_zone(): - name = smoke_tests_utils.get_cluster_name() - zone = 'eu-de-2' - test = smoke_tests_utils.Test( - 'zone', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky exec {name} --cloud ibm examples/minimal.yaml --zone {zone}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky status --all | grep {name} | grep {zone}', # Ensure the zone is correct. - ], - f'sky down -y {name} {name}-2 {name}-3', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_zone', - [ - f'sky launch -y -c {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky exec {name} --zone us-central1-a --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status --all | grep {name} | grep us-central1-a', # Ensure the zone is correct. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Test the image ---------- -@pytest.mark.aws -def test_aws_images(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} examples/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_images(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky launch -c {name} --image-id skypilot:cpu-debian-10 --cloud gcp tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_images(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure_images', - [ - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2204 --cloud azure tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -c {name} --image-id skypilot:v1-ubuntu-2004 --cloud azure tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_image_id_dict', - [ - # Use image id dict. 
- f'sky launch -y -c {name} examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_image_id_dict', - [ - # Use image id dict. - f'sky launch -y -c {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} "ls ~"', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_image_id_dict_region', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east-1 examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - # Should success because the image id match for the region. - f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. - # Ensure exec works. 
- f'sky exec {name} --region us-east-2 examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_region(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_image_id_dict_region', - [ - # Use region to filter image_id dict. - f'sky launch -y -c {name} --region us-east1 tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the region. - f'sky launch -c {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20230112 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-west3', # Ensure the region is correct. - # Ensure exec works. 
- f'sky exec {name} --region us-west3 tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-west3 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_image_id_dict_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_image_id_dict_zone', - [ - # YAML has - # image_id: - # us-west-2: skypilot:gpu-ubuntu-1804 - # us-east-2: skypilot:gpu-ubuntu-2004 - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east-1b examples/per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-east-2a examples/per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml', - # Fail due to image id mismatch. - f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-east-2a', # Ensure the zone is correct. - # Ensure exec works. 
- f'sky exec {name} --zone us-east-2a examples/per_region_images.yaml', - f'sky exec {name} examples/per_region_images.yaml', - f'sky exec {name} --cloud aws --region us-east-2 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_image_id_dict_zone(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_image_id_dict_zone', - [ - # Use zone to filter image_id dict. - f'sky launch -y -c {name} --zone us-east1-a tests/test_yamls/gcp_per_region_images.yaml && exit 1 || true', - f'sky status | grep {name} && exit 1 || true', # Ensure the cluster is not created. - f'sky launch -y -c {name} --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - # Should success because the image id match for the zone. - f'sky launch -y -c {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - f'sky exec {name} --cloud gcp --image-id skypilot:cpu-debian-10 tests/test_yamls/minimal.yaml', - # Fail due to image id mismatch. - f'sky exec {name} --cloud gcp --image-id skypilot:gpu-debian-10 tests/test_yamls/minimal.yaml && exit 1 || true', - f'sky logs {name} 1 --status', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 --status', - f'sky status --all | grep {name} | grep us-central1', # Ensure the zone is correct. - # Ensure exec works. 
- f'sky exec {name} --cloud gcp --zone us-central1-a tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} tests/test_yamls/gcp_per_region_images.yaml', - f'sky exec {name} --cloud gcp --region us-central1 "ls ~"', - f'sky exec {name} "ls ~"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_clone_disk_aws(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'clone_disk_aws', - [ - f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=60), - # Wait for EC2 instance to be in stopped state. - # TODO: event based wait. 
- 'sleep 60', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_clone_disk_gcp(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'clone_disk_gcp', - [ - f'sky launch -y -c {name} --cloud gcp --zone us-east1-b --retry-until-up "echo hello > ~/user_file.txt"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', - f'sky stop {name} -y', - f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud gcp --zone us-central1-a "cat ~/user_file.txt | grep hello"', - f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud gcp --zone us-east1-b "cat ~/user_file.txt | grep hello"', - f'sky logs {name}-clone 1 --status', - f'sky logs {name}-clone-2 1 --status', - ], - f'sky down -y {name} {name}-clone {name}-clone-2', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_mig(): - name = smoke_tests_utils.get_cluster_name() - region = 'us-central1' - test = smoke_tests_utils.Test( - 'gcp_mig', - [ - f'sky launch -y -c {name} --gpus t4 --num-nodes 2 --image-id skypilot:gpu-debian-10 --cloud gcp --region {region} tests/test_yamls/minimal.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. - # Check MIG exists. 
- f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', - f'sky autostop -i 0 --down -y {name}', - smoke_tests_utils.get_cmd_wait_until_cluster_is_not_found( - cluster_name=name, timeout=120), - f'gcloud compute instance-templates list | grep "sky-it-{name}"', - # Launch again with the same region. The original instance template - # should be removed. - f'sky launch -y -c {name} --gpus L4 --num-nodes 2 --region {region} nvidia-smi', - f'sky logs {name} 1 | grep "L4"', - f'sky down -y {name}', - f'gcloud compute instance-templates list | grep "sky-it-{name}" && exit 1 || true', - ], - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': 'tests/test_yamls/use_mig_config.yaml'}) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_force_enable_external_ips(): - name = smoke_tests_utils.get_cluster_name() - test_commands = [ - f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml', - # Check network of vm is "default" - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.network)" | grep "networks/default"'), - # Check External NAT in network access configs, corresponds to external ip - (f'gcloud compute instances list --filter=name~"{name}" --format=' - '"value(networkInterfaces.accessConfigs[0].name)" | grep "External NAT"' - ), - f'sky down -y {name}', - ] - skypilot_config = 'tests/test_yamls/force_enable_external_ips_config.yaml' - test = smoke_tests_utils.Test('gcp_force_enable_external_ips', - test_commands, - f'sky down -y {name}', - env={'SKYPILOT_CONFIG': skypilot_config}) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_image_no_conda(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'image_no_conda', - [ - # Use image id dict. 
- f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml', - f'sky logs {name} 1 --status', - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} examples/per_region_images.yaml', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_custom_default_conda_env(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test('custom_default_conda_env', [ - f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky status -r {name} | grep "UP"', - f'sky logs {name} 1 --status', - f'sky logs {name} 1 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 2 --status', - f'sky autostop -y -i 0 {name}', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=80), - f'sky start -y {name}', - f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', - f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', - f'sky logs {name} 3 --status', - ], f'sky down -y {name}') - smoke_tests_utils.run_one_test(test) - - -# ------------ Test stale job ------------ -# ------------ Test stale job ------------ -@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances -def test_stale_job(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'stale_job', - [ - f'sky launch -y -c {name} --cloud 
{generic_cloud} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - f'sky stop {name} -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=100), - f'sky start {name} -y', - f'sky logs {name} 1 --status', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED_DRIVER', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_stale_job_manual_restart(): - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = smoke_tests_utils.Test( - 'aws_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud aws --region {region} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text`; ' - f'aws ec2 stop-instances --region {region} ' - '--instance-ids $id', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=40), - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. - smoke_tests_utils. 
- get_cmd_wait_until_job_status_contains_without_matching_job( - cluster_name=name, - job_status=[sky.JobStatus.FAILED_DRIVER], - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_stale_job_manual_restart(): - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - zone = 'us-west2-a' - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name={name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - stop_cmd = (f'gcloud compute instances stop --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'gcp_stale_job_manual_restart', - [ - f'sky launch -y -c {name} --cloud gcp --zone {zone} "echo hi"', - f'sky exec {name} -d "echo start; sleep 10000"', - # Stop the cluster manually. - stop_cmd, - 'sleep 40', - f'sky launch -c {name} -y "echo hi"', - f'sky logs {name} 1 --status', - f'sky logs {name} 3 --status', - # Ensure the skylet updated the stale job status. - smoke_tests_utils. - get_cmd_wait_until_job_status_contains_without_matching_job( - cluster_name=name, - job_status=[sky.JobStatus.FAILED_DRIVER], - timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Check Sky's environment variables; workdir. ---------- -@pytest.mark.no_fluidstack # Requires amazon S3 -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_env_check(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = smoke_tests_utils.Test( - 'env_check', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- file_mounts ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. -def test_file_mounts(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - extra_flags = '' - if generic_cloud in 'kubernetes': - # Kubernetes does not support multi-node - # NOTE: This test will fail if you have a Kubernetes cluster running on - # arm64 (e.g., Apple Silicon) since goofys does not work on arm64. - extra_flags = '--num-nodes 1' - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {extra_flags} examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ] - test = smoke_tests_utils.Test( - 'using_file_mounts', - test_commands, - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud, 20 * 60), # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_file_mounts(): - name = smoke_tests_utils.get_cluster_name() - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} --num-nodes 1 examples/using_file_mounts.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ] - test = smoke_tests_utils.Test( - 'SCP_using_file_mounts', - test_commands, - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires GCP to be enabled -def test_using_file_mounts_with_env_vars(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - storage_name = TestStorageWithCredentials.generate_bucket_name() - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - (f'sky launch -y -c {name} --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name}'), - f'sky logs {name} 1 --status', # Ensure the job succeeded. - # Override with --env: - (f'sky launch -y -c {name}-2 --cpus 2+ --cloud {generic_cloud} ' - 'examples/using_file_mounts_with_env_vars.yaml ' - f'--env MY_BUCKET={storage_name} ' - '--env MY_LOCAL_PATH=tmpfile'), - f'sky logs {name}-2 1 --status', # Ensure the job succeeded. - ] - test = smoke_tests_utils.Test( - 'using_file_mounts_with_env_vars', - test_commands, - (f'sky down -y {name} {name}-2', - f'sky storage delete -y {storage_name} {storage_name}-2'), - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- storage ---------- -def _storage_mounts_commands_generator(f: TextIO, cluster_name: str, - storage_name: str, ls_hello_command: str, - cloud: str, only_mount: bool): - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render( - storage_name=storage_name, - cloud=cloud, - only_mount=only_mount, - ) - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {cluster_name} --cloud {cloud} {file_path}', - f'sky logs {cluster_name} 1 --status', # Ensure job succeeded. 
- ls_hello_command, - f'sky stop -y {cluster_name}', - f'sky start -y {cluster_name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {cluster_name} -- "set -ex; ls /mount_private_mount/hello.txt"', - ] - clean_command = f'sky down -y {cluster_name}; sky storage delete -y {storage_name}' - return test_commands, clean_command - - -@pytest.mark.aws -def test_aws_storage_mounts_with_stop(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'aws' - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = f'aws s3 ls {storage_name}/hello.txt' - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, cloud, False) - test = smoke_tests_utils.Test( - 'aws_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -def test_aws_storage_mounts_with_stop_only_mount(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'aws' - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = f'aws s3 ls {storage_name}/hello.txt' - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, cloud, True) - test = smoke_tests_utils.Test( - 'aws_storage_mounts_only_mount', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_storage_mounts_with_stop(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'gcp' - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = f'gsutil ls gs://{storage_name}/hello.txt' - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, 
ls_hello_command, cloud, False) - test = smoke_tests_utils.Test( - 'gcp_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_storage_mounts_with_stop(): - name = smoke_tests_utils.get_cluster_name() - cloud = 'azure' - storage_name = f'sky-test-{int(time.time())}' - default_region = 'eastus' - storage_account_name = (storage_lib.AzureBlobStore. - get_default_storage_account_name(default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - # if the file does not exist, az storage blob list returns '[]' - ls_hello_command = (f'output=$(az storage blob list -c {storage_name} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} ' - f'--prefix hello.txt) ' - f'[ "$output" = "[]" ] && exit 1 || exit 0') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, cloud, False) - test = smoke_tests_utils.Test( - 'azure_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_storage_mounts(): - # Tests bucket mounting on k8s, assuming S3 is configured. - # This test will fail if run on non x86_64 architecture, since goofys is - # built for x86_64 only. 
- name = smoke_tests_utils.get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - ls_hello_command = (f'aws s3 ls {storage_name}/hello.txt || ' - f'gsutil ls gs://{storage_name}/hello.txt') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test_commands, clean_command = _storage_mounts_commands_generator( - f, name, storage_name, ls_hello_command, 'kubernetes', False) - test = smoke_tests_utils.Test( - 'kubernetes_storage_mounts', - test_commands, - clean_command, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_switch(): - name = smoke_tests_utils.get_cluster_name() - new_context = f'sky-test-context-{int(time.time())}' - new_namespace = f'sky-test-namespace-{int(time.time())}' - - test_commands = [ - # Launch a cluster and run a simple task - f'sky launch -y -c {name} --cloud kubernetes "echo Hello from original context"', - f'sky logs {name} 1 --status', # Ensure job succeeded - - # Get current context details and save to a file for later use in cleanup - 'CURRENT_CONTEXT=$(kubectl config current-context); ' - 'echo "$CURRENT_CONTEXT" > /tmp/sky_test_current_context; ' - 'CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.cluster}"); ' - 'CURRENT_USER=$(kubectl config view -o jsonpath="{.contexts[?(@.name==\\"$CURRENT_CONTEXT\\")].context.user}"); ' - - # Create a new context with a different name and namespace - f'kubectl config set-context {new_context} --cluster="$CURRENT_CLUSTER" --user="$CURRENT_USER" --namespace={new_namespace}', - - # Create the new namespace if it doesn't exist - f'kubectl create namespace {new_namespace} --dry-run=client -o yaml | kubectl apply -f -', - - # Set the new context as active - f'kubectl config use-context {new_context}', - - # Verify the new context is active - f'[ "$(kubectl config current-context)" = "{new_context}" ] || exit 1', - - # Try to run sky exec on 
the original cluster (should still work) - f'sky exec {name} "echo Success: sky exec works after context switch"', - - # Test sky queue - f'sky queue {name}', - - # Test SSH access - f'ssh {name} whoami', - ] - - cleanup_commands = ( - f'kubectl delete namespace {new_namespace}; ' - f'kubectl config delete-context {new_context}; ' - 'kubectl config use-context $(cat /tmp/sky_test_current_context); ' - 'rm /tmp/sky_test_current_context; ' - f'sky down -y {name}') - - test = smoke_tests_utils.Test( - 'kubernetes_context_switch', - test_commands, - cleanup_commands, - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test image with python 3.11 installed by default. - 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - ]) -def test_docker_storage_mounts(generic_cloud: str, image_id: str): - # Tests bucket mounting on docker container - name = smoke_tests_utils.get_cluster_name() - timestamp = str(time.time()).replace('.', '') - storage_name = f'sky-test-{timestamp}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - # ubuntu 18.04 does not support fuse3, and blobfuse2 depends on fuse3. - azure_mount_unsupported_ubuntu_version = '18.04' - # Commands to verify bucket upload. We need to check all three - # storage types because the optimizer may pick any of them. 
- s3_command = f'aws s3 ls {storage_name}/hello.txt' - gsutil_command = f'gsutil ls gs://{storage_name}/hello.txt' - azure_blob_command = TestStorageWithCredentials.cli_ls_cmd( - storage_lib.StoreType.AZURE, storage_name, suffix='hello.txt') - if azure_mount_unsupported_ubuntu_version in image_id: - # The store for mount_private_mount is not specified in the template. - # If we're running on Azure, the private mount will be created on - # azure blob. That will not be supported on the ubuntu 18.04 image - # and thus fail. For other clouds, the private mount on other - # storage types (GCS/S3) should succeed. - include_private_mount = False if generic_cloud == 'azure' else True - content = template.render(storage_name=storage_name, - include_azure_mount=False, - include_private_mount=include_private_mount) - else: - content = template.render(storage_name=storage_name,) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - # Check AWS, GCP, or Azure storage mount. 
- f'{s3_command} || ' - f'{gsutil_command} || ' - f'{azure_blob_command}', - ] - test = smoke_tests_utils.Test( - 'docker_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.cloudflare -def test_cloudflare_storage_mounts(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_r2_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - endpoint_url = cloudflare.create_endpoint() - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {generic_cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{storage_name}/hello.txt --endpoint {endpoint_url} --profile=r2' - ] - - test = smoke_tests_utils.Test( - 'cloudflare_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_storage_mounts(): - name = smoke_tests_utils.get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - storage_name, Rclone.RcloneClouds.IBM) - template_str = pathlib.Path( - 'tests/test_yamls/test_ibm_cos_storage_mounting.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud ibm {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt', - ] - test = smoke_tests_utils.Test( - 'ibm_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- CLI logs ---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. 
-def test_cli_logs(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - num_nodes = 2 - if generic_cloud == 'kubernetes': - # Kubernetes does not support multi-node - num_nodes = 1 - timestamp = time.time() - test = smoke_tests_utils.Test('cli_logs', [ - f'sky launch -y -c {name} --cloud {generic_cloud} --num-nodes {num_nodes} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], f'sky down -y {name}') - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_logs(): - name = smoke_tests_utils.get_cluster_name() - timestamp = time.time() - test = smoke_tests_utils.Test( - 'SCP_cli_logs', - [ - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} "echo {timestamp} 1"', - f'sky exec {name} "echo {timestamp} 2"', - f'sky exec {name} "echo {timestamp} 3"', - f'sky exec {name} "echo {timestamp} 4"', - f'sky logs {name} 2 --status', - f'sky logs {name} 3 4 --sync-down', - f'sky logs {name} * --sync-down', - f'sky logs {name} 1 | grep "{timestamp} 1"', - f'sky logs {name} | grep "{timestamp} 4"', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Job Queue. ---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue instead -@pytest.mark.no_scp # SCP does not have T4 gpus. Run test_scp_job_queue instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. 
-@pytest.mark.no_oci # OCI does not have T4 gpus -def test_job_queue(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'job_queue', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 -d examples/job_queue/job.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 4 --status', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Job Queue with Docker. ---------- -@pytest.mark.no_fluidstack # FluidStack does not support docker for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_paperspace # Paperspace doesn't have T4 GPUs -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. 
- 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - # Axolotl image is a good example custom image that has its conda path - # set in PATH with dockerfile and uses python>=3.12. It could test: - # 1. we handle the env var set in dockerfile correctly - # 2. python>=3.12 works with SkyPilot runtime. - 'docker:winglian/axolotl:main-latest' - ]) -def test_job_queue_with_docker(generic_cloud: str, image_id: str): - name = smoke_tests_utils.get_cluster_name() + image_id[len('docker:'):][:4] - total_timeout_minutes = 40 if generic_cloud == 'azure' else 15 - time_to_sleep = 300 if generic_cloud == 'azure' else 180 - test = smoke_tests_utils.Test( - 'job_queue_with_docker', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', - f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - # Make sure the GPU is still visible to the container. 
- f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 4 --status', - f'sky stop -y {name}', - # Make sure the job status preserve after stop and start the - # cluster. This is also a test for the docker container to be - # preserved after stop and start. - f'sky start -y {name}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - # Make sure it is still visible after an stop & start cycle. - f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', - f'sky logs {name} 7 --status' - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_job_queue(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'lambda_job_queue', - [ - f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 --gpus A10:0.5 -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - 
-@pytest.mark.ibm -def test_ibm_job_queue(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'ibm_job_queue', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100', - f'sky exec {name} -n {name}-1 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-2 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky exec {name} -n {name}-3 --cloud ibm -d examples/job_queue/job_ibm.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_job_queue(): - name = smoke_tests_utils.get_cluster_name() - num_of_gpu_launch = 1 - num_of_gpu_exec = 0.5 - test = smoke_tests_utils.Test( - 'SCP_job_queue', - [ - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/job_queue/cluster.yaml', - f'sky exec {name} -n {name}-1 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-2 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky exec {name} -n {name}-3 {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_exec} -d examples/job_queue/job.yaml', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky queue {name} | grep {name}-2 | grep RUNNING', - f'sky queue {name} | grep {name}-3 | grep PENDING', - f'sky cancel -y {name} 2', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 
gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus. run test_ibm_job_queue_multinode instead -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus. -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus. -@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet -def test_job_queue_multinode(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - total_timeout_minutes = 30 if generic_cloud == 'azure' else 15 - test = smoke_tests_utils.Test( - 'job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml', - f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - 'sleep 90', - f'sky cancel -y {name} 1', - 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- 'sleep 5', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_large_job_queue(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', - f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', - 'sleep 90', - - # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. - # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. 
- f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', - # Make sure the jobs are scheduled in FIFO order - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(1, 17) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in range(17, 33) - ], - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep PENDING' - for i in range(33, 75) - ], - f'sky cancel -y {name} 33 35 37 39 17 18 19', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED' - for i in range(33, 40, 2) - ], - 'sleep 10', - *[ - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING' - for i in [34, 36, 38] - ], - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs -@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs -def test_fast_large_job_queue(generic_cloud: str): - # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. 
- name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'fast_large_job_queue', - [ - f'sky launch -y -c {name} --cpus 8 --cloud {generic_cloud}', - f'for i in `seq 1 32`; do sky exec {name} -n {name}-$i -d "echo $i"; done', - 'sleep 60', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep SUCCEEDED | wc -l | grep 32', - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.ibm -def test_ibm_job_queue_multinode(): - name = smoke_tests_utils.get_cluster_name() - task_file = 'examples/job_queue/job_multinode_ibm.yaml' - test = smoke_tests_utils.Test( - 'ibm_job_queue_multinode', - [ - f'sky launch -y -c {name} --cloud ibm --gpus v100 --num-nodes 2', - f'sky exec {name} -n {name}-1 -d {task_file}', - f'sky exec {name} -n {name}-2 -d {task_file}', - f'sky launch -y -c {name} -n {name}-3 --detach-setup -d {task_file}', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep SETTING_UP)', - 'sleep 90', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', - f'sky cancel -y {name} 1', - 'sleep 5', - f'sky queue {name} | grep {name}-3 | grep RUNNING', - f'sky cancel -y {name} 1 2 3', - f'sky launch -c {name} -n {name}-4 --detach-setup -d {task_file}', - # Test the job status is correctly set to SETTING_UP, during the setup is running, - # and the job can be cancelled during the setup. 
- f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', - f'sky cancel -y {name} 4', - f's=$(sky queue {name}) && printf "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', - f'sky exec {name} --gpus v100:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky exec {name} --gpus v100:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 5 --status', - f'sky logs {name} 6 --status', - f'sky logs {name} 7 --status', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Docker with preinstalled package. ---------- -@pytest.mark.no_fluidstack # Doesn't support Fluidstack for now -@pytest.mark.no_lambda_cloud # Doesn't support Lambda Cloud for now -@pytest.mark.no_ibm # Doesn't support IBM Cloud for now -@pytest.mark.no_scp # Doesn't support SCP for now -@pytest.mark.no_oci # Doesn't support OCI for now -@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -# TODO(zhwu): we should fix this for kubernetes -def test_docker_preinstalled_package(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'docker_with_preinstalled_package', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --image-id docker:nginx', - f'sky exec {name} "nginx -V"', - f'sky logs {name} 1 --status', - f'sky exec {name} whoami | grep root', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Submitting multiple tasks to the same cluster. 
---------- -@pytest.mark.no_fluidstack # FluidStack DC has low availability of T4 GPUs -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have T4 gpus -@pytest.mark.no_paperspace # Paperspace does not have T4 gpus -@pytest.mark.no_ibm # IBM Cloud does not have T4 gpus -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_oci # OCI Cloud does not have T4 gpus -def test_multi_echo(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multi_echo', - [ - f'python examples/multi_echo.py {name} {generic_cloud}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 10', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - 'sleep 30', - # Make sure that our job scheduler is fast enough to have at least - # 10 RUNNING jobs in parallel. - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "RUNNING" | wc -l | awk \'{{if ($1 < 10) exit 1}}\'', - 'sleep 30', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', - f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done', - ] + - # Ensure jobs succeeded. - [ - smoke_tests_utils. - get_cmd_wait_until_job_status_contains_matching_job_id( - cluster_name=name, - job_id=i + 1, - job_status=[sky.JobStatus.SUCCEEDED], - timeout=120) for i in range(32) - ] + - # Ensure monitor/autoscaler didn't crash on the 'assert not - # unfulfilled' error. If process not found, grep->ssh returns 1. - [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Task: 1 node training. 
---------- -@pytest.mark.no_fluidstack # Fluidstack does not have T4 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. -def test_huggingface(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.lambda_cloud -def test_lambda_huggingface(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'lambda_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} {smoke_tests_utils.LAMBDA_TYPE} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_huggingface(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - num_of_gpu_launch = 1 - test = smoke_tests_utils.Test( - 'SCP_huggingface_glue_imdb_app', - [ - f'sky launch -y -c {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- f'sky exec {name} {smoke_tests_utils.SCP_TYPE} {smoke_tests_utils.SCP_GPU_V100}:{num_of_gpu_launch} examples/huggingface_glue_imdb_app.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Inferentia. ---------- -@pytest.mark.aws -def test_inferentia(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test_inferentia', - [ - f'sky launch -y -c {name} -t inf2.xlarge -- echo hi', - f'sky exec {name} --gpus Inferentia:1 echo hi', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_app', - [ - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky launch -y -c {name} examples/tpu/tpu_app.yaml | grep "TPU .* already exists"', # Ensure sky launch won't create another TPU. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take >20 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU VM. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_vm_app', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', # Ensure the cluster is STOPPED. - # Use retry: guard against transient errors observed for - # just-stopped TPU VMs (#962). 
- f'sky start --retry-until-up -y {name}', - f'sky exec {name} examples/tpu/tpuvm_mnist.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU VM Pod. ---------- -@pytest.mark.gcp -@pytest.mark.tpu -def test_tpu_vm_pod(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_pod', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --gpus tpu-v2-32 --use-spot --zone europe-west4-a', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- TPU Pod Slice on GKE. ---------- -@pytest.mark.kubernetes -def test_tpu_pod_slice_gke(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'tpu_pod_slice_gke', - [ - f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', - f'sky logs {name} 1', # Ensure the job finished. - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. - f'sky logs {name} 2 --status' - ], - f'sky down -y {name}', - timeout=30 * 60, # can take 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Simple apps. 
---------- -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_hostname(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 - test = smoke_tests_utils.Test( - 'multi_hostname', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/multi_hostname.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky logs {name} 1 | grep "My hostname:" | wc -l | grep 2', # Ensure there are 2 hosts. - f'sky exec {name} examples/multi_hostname.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=smoke_tests_utils.get_timeout(generic_cloud, - total_timeout_minutes * 60), - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_multi_node_failure(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multi_node_failure', - [ - # TODO(zhwu): we use multi-thread to run the commands in setup - # commands in parallel, which makes it impossible to fail fast - # when one of the nodes fails. We should fix this in the future. - # The --detach-setup version can fail fast, as the setup is - # submitted to the remote machine, which does not use multi-thread. - # Refer to the comment in `subprocess_utils.run_in_parallel`. - # f'sky launch -y -c {name} --cloud {generic_cloud} tests/test_yamls/failed_worker_setup.yaml && exit 1', # Ensure the job setup failed. - f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup tests/test_yamls/failed_worker_setup.yaml', - f'sky logs {name} 1 --status | grep FAILED_SETUP', # Ensure the job setup failed. - f'sky exec {name} tests/test_yamls/failed_worker_run.yaml', - f'sky logs {name} 2 --status | grep FAILED', # Ensure the job failed. - f'sky logs {name} 2 | grep "My hostname:" | wc -l | grep 2', # Ensure there 2 of the hosts printed their hostname. 
- ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on GCP. ---------- -@pytest.mark.gcp -def test_gcp_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud gcp examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on AWS. ---------- -@pytest.mark.aws -def test_aws_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'aws_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud aws examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on Azure. ---------- -@pytest.mark.azure -def test_azure_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud azure examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on Kubernetes. ---------- -@pytest.mark.kubernetes -def test_kubernetes_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'kubernetes_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud kubernetes examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 100); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 5; done; if [ "$success" = false ]; then exit 1; fi' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on Paperspace. ---------- -@pytest.mark.paperspace -def test_paperspace_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'paperspace_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud paperspace examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Web apps with custom ports on RunPod. ---------- -@pytest.mark.runpod -def test_runpod_http_server_with_custom_ports(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'runpod_http_server_with_custom_ports', - [ - f'sky launch -y -d -c {name} --cloud runpod examples/http_server_with_custom_ports/task.yaml', - f'until SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}; do sleep 10; done', - # Retry a few times to avoid flakiness in ports being open. - f'ip=$(SKYPILOT_DEBUG=0 sky status --endpoint 33828 {name}); success=false; for i in $(seq 1 5); do if curl $ip | grep "

This is a demo HTML page.

"; then success=true; break; fi; sleep 10; done; if [ "$success" = false ]; then exit 1; fi', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Labels from task on AWS (instance_tags) ---------- -@pytest.mark.aws -def test_task_labels_aws(): - name = smoke_tests_utils.get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='aws', region='us-east-1') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'task_labels_aws', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with aws cli that the tags are set. - 'aws ec2 describe-instances ' - '--query "Reservations[*].Instances[*].InstanceId" ' - '--filters "Name=instance-state-name,Values=running" ' - f'--filters "Name=tag:skypilot-cluster-name,Values={name}*" ' - '--filters "Name=tag:inlinelabel1,Values=inlinevalue1" ' - '--filters "Name=tag:inlinelabel2,Values=inlinevalue2" ' - '--region us-east-1 --output text', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Labels from task on GCP (labels) ---------- -@pytest.mark.gcp -def test_task_labels_gcp(): - name = smoke_tests_utils.get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='gcp') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'task_labels_gcp', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with gcloud cli that the tags are set - f'gcloud compute instances list --filter="name~\'^{name}\' AND ' - 'labels.inlinelabel1=\'inlinevalue1\' AND ' - 'labels.inlinelabel2=\'inlinevalue2\'" ' - '--format="value(name)" | 
grep .', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Labels from task on Kubernetes (labels) ---------- -@pytest.mark.kubernetes -def test_task_labels_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - template_str = pathlib.Path( - 'tests/test_yamls/test_labels.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(cloud='kubernetes') - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'task_labels_kubernetes', - [ - f'sky launch -y -c {name} {file_path}', - # Verify with kubectl that the labels are set. - 'kubectl get pods ' - '--selector inlinelabel1=inlinevalue1 ' - '--selector inlinelabel2=inlinevalue2 ' - '-o jsonpath=\'{.items[*].metadata.name}\' | ' - f'grep \'^{name}\'' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Pod Annotations on Kubernetes ---------- -@pytest.mark.kubernetes -def test_add_pod_annotations_for_autodown_with_launch(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'add_pod_annotations_for_autodown_with_launch', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - # Autodown is set. - f'sky launch -y -c {name} -i 10 --down --num-nodes 2 --cpus=1 --cloud kubernetes', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. 
- 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop' - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_add_and_remove_pod_annotations_with_autostop(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'add_and_remove_pod_annotations_with_autostop', - [ - # Launch Kubernetes cluster with two nodes, each being head node and worker node. - f'sky launch -y -c {name} --num-nodes 2 --cpus=1 --cloud kubernetes', - # Set autodown on the cluster with 'autostop' command. - f'sky autostop -y {name} -i 20 --down', - # Get names of the pods containing cluster name. - f'pod_1=$(kubectl get pods -o name | grep {name} | sed -n 1p)', - f'pod_2=$(kubectl get pods -o name | grep {name} | sed -n 2p)', - # Describe the first pod and check for annotations. - 'kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check for annotations. - 'kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - 'kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - # Cancel the set autodown to remove the annotations from the pods. - f'sky autostop -y {name} --cancel', - # Describe the first pod and check if annotations are removed. - '! kubectl describe pod $pod_1 | grep -q skypilot.co/autodown', - '! kubectl describe pod $pod_1 | grep -q skypilot.co/idle_minutes_to_autostop', - # Describe the second pod and check if annotations are removed. - '! kubectl describe pod $pod_2 | grep -q skypilot.co/autodown', - '! 
kubectl describe pod $pod_2 | grep -q skypilot.co/idle_minutes_to_autostop', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Container logs from task on Kubernetes ---------- -@pytest.mark.kubernetes -def test_container_logs_multinode_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - head_logs = ('kubectl get pods ' - f' | grep {name} | grep head | ' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - worker_logs = ('kubectl get pods ' - f' | grep {name} | grep worker |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = smoke_tests_utils.Test( - 'container_logs_multinode_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml} --num-nodes 2', - f'{head_logs} | wc -l | grep 9', - f'{worker_logs} | wc -l | grep 9', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_container_logs_two_jobs_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = smoke_tests_utils.Test( - 'test_container_logs_two_jobs_kubernetes', - [ - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 9', - f'sky launch -y -c {name} {task_yaml}', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', 
- ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_container_logs_two_simultaneous_jobs_kubernetes(): - name = smoke_tests_utils.get_cluster_name() - task_yaml = 'tests/test_yamls/test_k8s_logs.yaml ' - pod_logs = ('kubectl get pods ' - f' | grep {name} | grep head |' - " awk '{print $1}' | xargs -I {} kubectl logs {}") - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - test = smoke_tests_utils.Test( - 'test_container_logs_two_simultaneous_jobs_kubernetes', - [ - f'sky launch -y -c {name}', - f'sky exec -c {name} -d {task_yaml}', - f'sky exec -c {name} -d {task_yaml}', - 'sleep 30', - f'{pod_logs} | wc -l | grep 18', - f'{pod_logs} | grep 1 | wc -l | grep 2', - f'{pod_logs} | grep 2 | wc -l | grep 2', - f'{pod_logs} | grep 3 | wc -l | grep 2', - f'{pod_logs} | grep 4 | wc -l | grep 2', - f'{pod_logs} | grep 5 | wc -l | grep 2', - f'{pod_logs} | grep 6 | wc -l | grep 2', - f'{pod_logs} | grep 7 | wc -l | grep 2', - f'{pod_logs} | grep 8 | wc -l | grep 2', - f'{pod_logs} | grep 9 | wc -l | grep 2', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Task: n=2 nodes with setups. ---------- -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.skip( - reason= - 'The resnet_distributed_tf_app is flaky, due to it failing to detect GPUs.') -def test_distributed_tf(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'resnet_distributed_tf_app', - [ - # NOTE: running it twice will hang (sometimes?) - an app-level bug. - f'python examples/resnet_distributed_tf_app.py {name} {generic_cloud}', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - timeout=25 * 60, # 25 mins (it takes around ~19 mins) - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing GCP start and stop instances ---------- -@pytest.mark.gcp -def test_gcp_start_stop(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'gcp-start-stop', - [ - f'sky launch -y -c {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. - f'sky logs {name} 3 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=40), - f'sky start -y {name} -i 1', - f'sky exec {name} examples/gcp_start_stop.yaml', - f'sky logs {name} 4 --status', # Ensure the job succeeded. - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[ - sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT - ], - timeout=200), - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing Azure start and stop instances ---------- -@pytest.mark.azure -def test_azure_start_stop(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure-start-stop', - [ - f'sky launch -y -c {name} examples/azure_start_stop.yaml', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. 
- f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec {name} examples/azure_start_stop.yaml', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[ - sky.ClusterStatus.STOPPED, sky.ClusterStatus.INIT - ], - timeout=280) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing Autostopping ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support stopping instances -@pytest.mark.no_ibm # FIX(IBM) sporadically fails, as restarted workers stay uninitialized indefinitely -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -@pytest.mark.no_kubernetes # Kubernetes does not autostop yet -def test_autostop(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure - # the VM is stopped. - autostop_timeout = 600 if generic_cloud == 'azure' else 250 - # Launching and starting Azure clusters can take a long time too. e.g., restart - # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. - total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 - test = smoke_tests_utils.Test( - 'autostop', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} -i 1', - - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m"', - - # Ensure the cluster is not stopped early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - - # Ensure the cluster is STOPPED. 
- smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout), - - # Ensure the cluster is UP and the autostop setting is reset ('-'). - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - - # Ensure the job succeeded. - f'sky exec {name} tests/test_yamls/minimal.yaml', - f'sky logs {name} 2 --status', - - # Test restarting the idleness timer via reset: - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 40', # Almost reached the threshold. - f'sky autostop -y {name} -i 1', # Should restart the timer. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout), - - # Test restarting the idleness timer via exec: - f'sky start -y {name}', - f'sky status | grep {name} | grep -E "UP\s+-"', - f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 45', # Almost reached the threshold. - f'sky exec {name} echo hi', # Should restart the timer. - 'sleep 45', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=autostop_timeout + smoke_tests_utils.BUMP_UP_SECONDS), - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing Autodowning ---------- -@pytest.mark.no_fluidstack # FluidStack does not support stopping in SkyPilot implementation -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. -def test_autodown(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure - # the VM is terminated. 
- autodown_timeout = 900 if generic_cloud == 'azure' else 240 - total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 - test = smoke_tests_utils.Test( - 'autodown', - [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 40', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - # Ensure the cluster is terminated. - f'sleep {autodown_timeout}', - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - f'sleep {autodown_timeout}', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - f'sleep {autodown_timeout}', - # Ensure the cluster is still UP. 
- f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=total_timeout_minutes * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.scp -def test_scp_autodown(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'SCP_autodown', - [ - f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --down -i 1', - # Ensure autostop is set. - f'sky status | grep {name} | grep "1m (down)"', - # Ensure the cluster is not terminated early. - 'sleep 45', - f'sky status --refresh | grep {name} | grep UP', - # Ensure the cluster is terminated. - 'sleep 200', - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} {smoke_tests_utils.SCP_TYPE} tests/test_yamls/minimal.yaml', - f'sky status | grep {name} | grep "1m (down)"', - 'sleep 200', - # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} {smoke_tests_utils.SCP_TYPE} --down tests/test_yamls/minimal.yaml', - f'sky autostop -y {name} --cancel', - 'sleep 200', - # Ensure the cluster is still UP. 
- f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && echo "$s" | grep {name} | grep UP', - ], - f'sky down -y {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): - test = smoke_tests_utils.Test( - f'{cloud}-cancel-task', - [ - f'sky launch -c {name} examples/resnet_app.yaml --cloud {cloud} -y -d', - # Wait the GPU process to start. - 'sleep 60', - f'sky exec {name} "nvidia-smi | grep python"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky cancel -y {name} 1', - 'sleep 60', - # check if the python job is gone. - f'sky exec {name} "! nvidia-smi | grep python"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=timeout, - ) - return test - - -# ---------- Testing `sky cancel` ---------- -@pytest.mark.aws -def test_cancel_aws(): - name = smoke_tests_utils.get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'aws') - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_cancel_gcp(): - name = smoke_tests_utils.get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'gcp') - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_cancel_azure(): - name = smoke_tests_utils.get_cluster_name() - test = _get_cancel_task_with_cloud(name, 'azure', timeout=30 * 60) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now -@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus -@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA -@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi -@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet -def test_cancel_pytorch(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'cancel-pytorch', - [ - f'sky launch -c {name} --cloud {generic_cloud} 
examples/resnet_distributed_torch.yaml -y -d', - # Wait the GPU process to start. - 'sleep 90', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || ' - # When run inside container/k8s, nvidia-smi cannot show process ids. - # See https://github.com/NVIDIA/nvidia-docker/issues/179 - # To work around, we check if GPU utilization is greater than 0. - f'[ \$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits) -gt 0 ]"', - f'sky logs {name} 2 --status', # Ensure the job succeeded. - f'sky cancel -y {name} 1', - 'sleep 60', - f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep \'No running process\') || ' - # Ensure Xorg is the only process running. - '[ \$(nvidia-smi | grep -A 10 Processes | grep -A 10 === | grep -v Xorg) -eq 2 ]"', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# can't use `_get_cancel_task_with_cloud()`, as command `nvidia-smi` -# requires a CUDA public image, which IBM doesn't offer -@pytest.mark.ibm -def test_cancel_ibm(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'ibm-cancel-task', - [ - f'sky launch -y -c {name} --cloud ibm examples/minimal.yaml', - f'sky exec {name} -n {name}-1 -d "while true; do echo \'Hello SkyPilot\'; sleep 2; done"', - 'sleep 20', - f'sky queue {name} | grep {name}-1 | grep RUNNING', - f'sky cancel -y {name} 2', - f'sleep 5', - f'sky queue {name} | grep {name}-1 | grep CANCELLED', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing use-spot option ---------- -@pytest.mark.no_fluidstack # FluidStack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances 
-@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -def test_use_spot(generic_cloud: str): - """Test use-spot and sky exec.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'use-spot', - [ - f'sky launch -c {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml --use-spot -y', - f'sky logs {name} 1 --status', - f'sky exec {name} echo hi', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_stop_gcp_spot(): - """Test GCP spot can be stopped, autostopped, restarted.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'stop_gcp_spot', - [ - f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', - # stop should go through: - f'sky stop {name} -y', - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 2 --status', - f'sky autostop {name} -i0 -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=90), - f'sky start {name} -y', - f'sky exec {name} -- ls myfile', - f'sky logs {name} 3 --status', - # -i option at launch should go through: - f'sky launch -c {name} -i0 -y', - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=120), - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing managed job ---------- -# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues -# when the controller being on Azure, which takes a long time for launching -# step. 
-@pytest.mark.managed_jobs -def test_managed_jobs(generic_cloud: str): - """Test the managed jobs yaml.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed-jobs', - [ - f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-1', - job_status=[ - sky.ManagedJobStatus.PENDING, - sky.ManagedJobStatus.SUBMITTED, - sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING - ], - timeout=60), - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[ - sky.ManagedJobStatus.PENDING, - sky.ManagedJobStatus.SUBMITTED, - sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING - ], - timeout=60), - f'sky jobs cancel -y -n {name}-1', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-1', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=230), - # Test the functionality for logging. - f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', - f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"', - ], - # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when - # canceling multiple job names is supported. - f'sky jobs cancel -y -n {name}-1; sky jobs cancel -y -n {name}-2', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_job_pipeline(generic_cloud: str): - """Test a job pipeline.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'spot-pipeline', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d', - 'sleep 5', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', - # `grep -A 4 {name}` finds the job with {name} and the 4 lines - # after it, i.e. the 4 tasks within the job. - # `sed -n 2p` gets the second line of the 4 lines, i.e. the first - # task within the job. 
- f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"', - f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_failed_setup(generic_cloud: str): - """Test managed job with failed setup.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed_jobs_failed_setup', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - # Make sure the job failed quickly. 
- smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.FAILED_SETUP], - timeout=330 + smoke_tests_utils.BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack #fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): - """Test managed job with failed setup for a pipeline.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed_jobs_pipeline_failed_setup', - [ - f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.FAILED_SETUP], - timeout=600), - # Make sure the job failed quickly. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', - # Task 0 should be SUCCEEDED. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"', - # Task 1 should be FAILED_SETUP. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"', - # Task 2 should be CANCELLED. - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"', - # Task 3 should be CANCELLED. 
- f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing managed job recovery ---------- - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_aws(aws_config_region): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_aws', - [ - f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=600), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_gcp(): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-east4-b' - query_cmd = ( - f'gcloud compute instances list --filter=' - # `:` means prefix match. - f'"(labels.ray-cluster-name:{name_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the cluster manually. - terminate_cmd, - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_aws(aws_config_region): - """Test managed job recovery for a pipeline.""" - name = smoke_tests_utils.get_cluster_name() - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - region = aws_config_region - if region != 'us-east-2': - pytest.skip('Only run spot pipeline recovery test in us-east-2') - test = smoke_tests_utils.Test( - 'managed_jobs_pipeline_recovery_aws', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', - # Terminate the cluster manually. - # The `cat ...| rev` is to retrieve the job_id from the - # SKYPILOT_TASK_ID, which gets the second to last field - # separated by `-`. - ( - f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' - 'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;' - f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - # TODO(zhwu): fix the name for spot cluster. 
- '--filters Name=tag:ray-cluster-name,Values=*-${MANAGED_JOB_ID}' - f'-{user_hash} ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=200), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', - f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', - f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', - f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_pipeline_recovery_gcp(): - """Test managed job recovery for a pipeline.""" - name = smoke_tests_utils.get_cluster_name() - zone = 'us-east4-b' - user_hash = common_utils.get_user_hash() - user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME] - query_cmd = ( - 'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:*-${{MANAGED_JOB_ID}}-{user_hash})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_pipeline_recovery_gcp', - [ - f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name(
- job_name=name,
- job_status=[sky.ManagedJobStatus.RUNNING],
- timeout=400),
- f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id',
- f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids',
- # Terminate the cluster manually.
- # The `cat ...| rev` is to retrieve the job_id from the
- # SKYPILOT_TASK_ID, which gets the second to last field
- # separated by `-`.
- (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | '
- f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'),
- smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name),
- f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"',
- smoke_tests_utils.
- get_cmd_wait_until_managed_job_status_contains_matching_job_name(
- job_name=name,
- job_status=[sky.ManagedJobStatus.RUNNING],
- timeout=200),
- f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"',
- f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new',
- f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new',
- f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`',
- ],
- f'sky jobs cancel -y -n {name}',
- timeout=25 * 60,
- )
- smoke_tests_utils.run_one_test(test)
-
-
-@pytest.mark.no_fluidstack # Fluidstack does not support spot instances
-@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
-@pytest.mark.no_ibm # IBM Cloud does not support spot instances
-@pytest.mark.no_scp # SCP does not support spot instances
-@pytest.mark.no_paperspace # Paperspace does not support spot instances
-@pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances
-@pytest.mark.managed_jobs
-def test_managed_jobs_recovery_default_resources(generic_cloud: str): - """Test managed job recovery for default resources.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'managed-spot-recovery-default-resources', - [ - f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.RUNNING, - sky.ManagedJobStatus.RECOVERING - ], - timeout=360), - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_aws(aws_config_region): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_multi_node_aws', - [ - f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=450), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. 
- (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* ' - 'Name=tag:ray-node-type,Values=worker ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_recovery_multi_node_gcp(): - """Test managed job recovery.""" - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west2-a' - # Use ':' to match as the cluster name will contain the suffix with job id - query_cmd = ( - f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_on_cloud} AND ' - f'labels.ray-node-type=worker)" --zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_recovery_multi_node_gcp', - [ - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=400), - f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', - # Terminate the worker manually. - terminate_cmd, - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=560), - f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', - ], - f'sky jobs cancel -y -n {name}', - timeout=25 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_aws(aws_config_region): - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_2_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-2', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - f'{name}-3', jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - region = aws_config_region - test = smoke_tests_utils.Test( - 'managed_jobs_cancellation_aws', - [ - # Test cancellation during spot cluster being launched. - f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.STARTING, sky.ManagedJobStatus.RUNNING - ], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]' - ), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - # The job is running in the cluster, will shown as RUNNING. - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - # Terminate the cluster manually. 
- (f'aws ec2 terminate-instances --region {region} --instance-ids $(' - f'aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].InstanceId ' - '--output text)'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$(aws ec2 describe-instances --region {region} ' - f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' - f'--query Reservations[].Instances[].State[].Name ' - '--output text) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "pending|running|stopped|stopping"' - ), - ], - timeout=25 * 60) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.managed_jobs -def test_managed_jobs_cancellation_gcp(): - name = smoke_tests_utils.get_cluster_name() - name_3 = f'{name}-3' - name_3_on_cloud = common_utils.make_cluster_name_on_cloud( - name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False) - zone = 'us-west3-b' - query_state_cmd = ( - 'gcloud compute instances list ' - f'--filter="(labels.ray-cluster-name:{name_3_on_cloud})" ' - '--format="value(status)"') - query_cmd = (f'gcloud compute instances list --filter=' - f'"(labels.ray-cluster-name:{name_3_on_cloud})" ' - f'--zones={zone} --format="value(name)"') - terminate_cmd = (f'gcloud compute instances delete --zone={zone}' - f' --quiet $({query_cmd})') - test = smoke_tests_utils.Test( - 'managed_jobs_cancellation_gcp', 
- [ - # Test cancellation during spot cluster being launched. - f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.STARTING], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # Test cancelling the spot cluster during spot job being setup. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - # The job is set up in the cluster, will shown as RUNNING. - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - f'sky jobs cancel -y -n {name}-2', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-2', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # Test cancellation during spot job is recovering. - f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.RUNNING], - timeout=300 + smoke_tests_utils.BUMP_UP_SECONDS), - # Terminate the cluster manually. - terminate_cmd, - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), - f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', - f'sky jobs cancel -y -n {name}-3', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=f'{name}-3', - job_status=[sky.ManagedJobStatus.CANCELLED], - timeout=120 + smoke_tests_utils.BUMP_UP_SECONDS), - # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because - # there can be multiple VM with the same name due to the recovery. - (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' - ), - ], - timeout=25 * 60) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing storage for managed job ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances -@pytest.mark.no_ibm # IBM Cloud does not support spot instances -@pytest.mark.no_paperspace # Paperspace does not support spot instances -@pytest.mark.no_scp # SCP does not support spot instances -@pytest.mark.managed_jobs -def test_managed_jobs_storage(generic_cloud: str): - """Test storage with managed job""" - name = smoke_tests_utils.get_cluster_name() - yaml_str = pathlib.Path( - 'examples/managed_job_with_storage.yaml').read_text() - timestamp = int(time.time()) - storage_name = f'sky-test-{timestamp}' - output_storage_name = f'sky-test-output-{timestamp}' - - # Also perform region testing for bucket creation to validate if buckets are - # created in the correct region and correctly mounted in managed jobs. - # However, we inject this testing only for AWS and GCP since they are the - # supported object storage providers in SkyPilot. 
- region_flag = '' - region_validation_cmd = 'true' - use_spot = ' --use-spot' - if generic_cloud == 'aws': - region = 'eu-central-1' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.S3, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - output_check_cmd = f'{s3_check_file_count} | grep 1' - elif generic_cloud == 'gcp': - region = 'us-west2' - region_flag = f' --region {region}' - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.GCS, bucket_name=storage_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - output_check_cmd = f'{gcs_check_file_count} | grep 1' - elif generic_cloud == 'azure': - region = 'westus2' - region_flag = f' --region {region}' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name(region)) - region_cmd = TestStorageWithCredentials.cli_region_cmd( - storage_lib.StoreType.AZURE, - storage_account_name=storage_account_name) - region_validation_cmd = f'{region_cmd} | grep {region}' - az_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.AZURE, - output_storage_name, - 'output.txt', - storage_account_name=storage_account_name) - output_check_cmd = f'{az_check_file_count} | grep 1' - elif generic_cloud == 'kubernetes': - # With Kubernetes, we don't know which object storage provider is used. - # Check both S3 and GCS if bucket exists in either. 
- s3_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.S3, output_storage_name, 'output.txt') - s3_output_check_cmd = f'{s3_check_file_count} | grep 1' - gcs_check_file_count = TestStorageWithCredentials.cli_count_name_in_bucket( - storage_lib.StoreType.GCS, output_storage_name, 'output.txt') - gcs_output_check_cmd = f'{gcs_check_file_count} | grep 1' - output_check_cmd = f'{s3_output_check_cmd} || {gcs_output_check_cmd}' - use_spot = ' --no-use-spot' - - yaml_str = yaml_str.replace('sky-workdir-zhwu', storage_name) - yaml_str = yaml_str.replace('sky-output-bucket', output_storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(yaml_str) - f.flush() - file_path = f.name - test = smoke_tests_utils.Test( - 'managed_jobs_storage', - [ - *smoke_tests_utils.STORAGE_SETUP_COMMANDS, - f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', - region_validation_cmd, # Check if the bucket is created in the correct region - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.SUCCEEDED], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - # Wait for the job to be cleaned up. - 'sleep 20', - f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', - # Check if file was written to the mounted output bucket - output_check_cmd - ], - (f'sky jobs cancel -y -n {name}', - f'; sky storage delete {output_storage_name} || true'), - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. 
- timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing spot TPU ---------- -@pytest.mark.gcp -@pytest.mark.managed_jobs -@pytest.mark.tpu -def test_managed_jobs_tpu(): - """Test managed job on TPU.""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-spot-tpu', - [ - f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.STARTING], - timeout=60 + smoke_tests_utils.BUMP_UP_SECONDS), - # TPU takes a while to launch - smoke_tests_utils. - get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[ - sky.ManagedJobStatus.RUNNING, sky.ManagedJobStatus.SUCCEEDED - ], - timeout=900 + smoke_tests_utils.BUMP_UP_SECONDS), - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env for managed jobs ---------- -@pytest.mark.managed_jobs -def test_managed_jobs_inline_env(generic_cloud: str): - """Test managed jobs env""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-managed-jobs-inline-env', - [ - f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "echo "\\$TEST_ENV"; ([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - smoke_tests_utils. 
- get_cmd_wait_until_managed_job_status_contains_matching_job_name( - job_name=name, - job_status=[sky.ManagedJobStatus.SUCCEEDED], - timeout=20 + smoke_tests_utils.BUMP_UP_SECONDS), - f'JOB_ROW=$(sky jobs queue | grep {name} | head -n1) && ' - f'echo "$JOB_ROW" && echo "$JOB_ROW" | grep "SUCCEEDED" && ' - f'JOB_ID=$(echo "$JOB_ROW" | awk \'{{print $1}}\') && ' - f'echo "JOB_ID=$JOB_ID" && ' - # Test that logs are still available after the job finishes. - 'unset SKYPILOT_DEBUG; s=$(sky jobs logs $JOB_ID --refresh) && echo "$s" && echo "$s" | grep "hello world" && ' - # Make sure we skip the unnecessary logs. - 'echo "$s" | head -n1 | grep "Waiting for"', - ], - f'sky jobs cancel -y -n {name}', - # Increase timeout since sky jobs queue -r can be blocked by other spot tests. - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env ---------- -def test_inline_env(generic_cloud: str): - """Test env""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-inline-env', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'sky logs {name} 1 --status', - f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env file ---------- -def test_inline_env_file(generic_cloud: str): - """Test env""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-inline-env-file', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 1 --status', - f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - f'sky logs {name} 2 --status', - ], - f'sky down -y {name}', - smoke_tests_utils.get_timeout(generic_cloud), - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing custom image ---------- -@pytest.mark.aws -def test_aws_custom_image(): - """Test AWS custom image""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-aws-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud aws --region us-east-2 --image-id ami-062ddd90fb6f8267a', # Nvidia image - f'sky logs {name} 1 --status', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.parametrize( - 'image_id', - [ - 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', - 'docker:ubuntu:18.04', - # Test latest image with python 3.11 installed by default. 
- 'docker:continuumio/miniconda3:24.1.2-0', - # Test python>=3.12 where SkyPilot should automatically create a separate - # conda env for runtime with python 3.10. - 'docker:continuumio/miniconda3:latest', - ]) -def test_kubernetes_custom_image(image_id): - """Test Kubernetes custom image""" - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'test-kubernetes-custom-image', - [ - f'sky launch -c {name} --retry-until-up -y tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1', - f'sky logs {name} 1 --status', - # Try exec to run again and check if the logs are printed - f'sky exec {name} tests/test_yamls/test_custom_image.yaml --cloud kubernetes --image-id {image_id} --region None --gpus T4:1 | grep "Hello 100"', - # Make sure ssh is working with custom username - f'ssh {name} echo hi | grep hi', - ], - f'sky down -y {name}', - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_start_stop_two_nodes(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'azure-start-stop-two-nodes', - [ - f'sky launch --num-nodes=2 -y -c {name} examples/azure_start_stop.yaml', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky stop -y {name}', - f'sky start -y {name} -i 1', - f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', - f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[ - sky.ClusterStatus.INIT, sky.ClusterStatus.STOPPED - ], - timeout=200 + smoke_tests_utils.BUMP_UP_SECONDS) + - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' - ], - f'sky down -y {name}', - timeout=30 * 60, # 30 mins (it takes around ~23 mins) - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing env for disk tier ---------- -@pytest.mark.aws -def test_aws_disk_tier(): - - def _get_aws_query_command(region, instance_id, field, expected): - return (f'aws ec2 describe-volumes --region {region} ' - f'--filters Name=attachment.instance-id,Values={instance_id} ' - f'--query Volumes[*].{field} | grep {expected} ; ') - - for disk_tier in list(resources_utils.DiskTier): - specs = AWS._get_disk_specs(disk_tier) - name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.AWS.max_cluster_name_length()) - region = 'us-east-2' - test = smoke_tests_utils.Test( - 'aws-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud aws --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'id=`aws ec2 describe-instances --region {region} --filters ' - f'Name=tag:ray-cluster-name,Values={name_on_cloud} --query ' - f'Reservations[].Instances[].InstanceId --output text`; ' + - _get_aws_query_command(region, '$id', 'VolumeType', - specs['disk_tier']) + - ('' if specs['disk_tier'] - == 'standard' else _get_aws_query_command( - region, '$id', 'Iops', specs['disk_iops'])) + - ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( - region, '$id', 'Throughput', specs['disk_throughput'])), - ], - f'sky down -y {name}', - timeout=10 * 60, # 10 mins (it takes around ~6 mins) - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - disk_types = 
[GCP._get_disk_type(disk_tier)] - name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.GCP.max_cluster_name_length()) - region = 'us-west2' - instance_type_options = [''] - if disk_tier == resources_utils.DiskTier.BEST: - # Ultra disk tier requires n2 instance types to have more than 64 CPUs. - # If using default instance type, it will only enable the high disk tier. - disk_types = [ - GCP._get_disk_type(resources_utils.DiskTier.HIGH), - GCP._get_disk_type(resources_utils.DiskTier.ULTRA), - ] - instance_type_options = ['', '--instance-type n2-standard-64'] - for disk_type, instance_type_option in zip(disk_types, - instance_type_options): - test = smoke_tests_utils.Test( - 'gcp-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud gcp --region {region} ' - f'--disk-tier {disk_tier.value} {instance_type_option} ', - f'name=`gcloud compute instances list --filter=' - f'"labels.ray-cluster-name:{name_on_cloud}" ' - '--format="value(name)"`; ' - f'gcloud compute disks list --filter="name=$name" ' - f'--format="value(type)" | grep {disk_type} ' - ], - f'sky down -y {name}', - timeout=6 * 60, # 6 mins (it takes around ~3 mins) - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_disk_tier(): - for disk_tier in list(resources_utils.DiskTier): - if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: - # Azure does not support high and ultra disk tier. 
- continue - type = Azure._get_disk_type(disk_tier) - name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = smoke_tests_utils.Test( - 'azure-disk-tier-' + disk_tier.value, - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier {disk_tier.value} echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}' - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -def test_azure_best_tier_failover(): - type = Azure._get_disk_type(resources_utils.DiskTier.LOW) - name = smoke_tests_utils.get_cluster_name() - name_on_cloud = common_utils.make_cluster_name_on_cloud( - name, sky.Azure.max_cluster_name_length()) - region = 'westus2' - test = smoke_tests_utils.Test( - 'azure-best-tier-failover', - [ - f'sky launch -y -c {name} --cloud azure --region {region} ' - f'--disk-tier best --instance-type Standard_D8_v5 echo "hello sky"', - f'az resource list --tag ray-cluster-name={name_on_cloud} --query ' - f'"[?type==\'Microsoft.Compute/disks\'].sku.name" ' - f'--output tsv | grep {type}', - ], - f'sky down -y {name}', - timeout=20 * 60, # 20 mins (it takes around ~12 mins) - ) - smoke_tests_utils.run_one_test(test) - - -# ------ Testing Zero Quota Failover ------ -@pytest.mark.aws -def test_aws_zero_quota_failover(): - - name = smoke_tests_utils.get_cluster_name() - region = smoke_tests_utils.get_aws_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for EC2 P3 instances were found on all AWS regions. 
Is this ' - 'expected for your account?') - return - - test = smoke_tests_utils.Test( - 'aws-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_zero_quota_failover(): - - name = smoke_tests_utils.get_cluster_name() - region = smoke_tests_utils.get_gcp_region_for_quota_failover() - - if not region: - pytest.xfail( - 'Unable to test zero quota failover optimization — quotas ' - 'for A100-80GB GPUs were found on all GCP regions. Is this ' - 'expected for your account?') - return - - test = smoke_tests_utils.Test( - 'gcp-zero-quota-failover', - [ - f'sky launch -y -c {name} --cloud gcp --region {region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -def test_long_setup_run_script(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - with tempfile.NamedTemporaryFile('w', prefix='sky_app_', - suffix='.yaml') as f: - f.write( - textwrap.dedent(""" \ - setup: | - echo "start long setup" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end long setup"\n') - f.write( - textwrap.dedent(""" \ - run: | - echo "run" - """)) - for i in range(1024 * 200): - f.write(f' echo {i}\n') - f.write(' echo "end run"\n') - f.flush() - - test = smoke_tests_utils.Test( - 'long-setup-run-script', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} --cpus 2+ {f.name}', - f'sky exec {name} "echo hello"', - f'sky exec {name} {f.name}', - f'sky logs {name} --status 1', - f'sky logs {name} --status 2', - f'sky logs {name} --status 3', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Testing skyserve ---------- - - -def _get_service_name() -> str: - """Returns a user-unique service name for each test_skyserve_(). 
- - Must be called from each test_skyserve_(). - """ - caller_func_name = inspect.stack()[1][3] - test_name = caller_func_name.replace('_', '-').replace('test-', 't-') - test_name = test_name.replace('skyserve-', 'ss-') - test_name = common_utils.make_cluster_name_on_cloud(test_name, 24) - return f'{test_name}-{smoke_tests_utils.test_id}' - - -# We check the output of the skyserve service to see if it is ready. Output of -# `REPLICAS` is in the form of `1/2` where the first number is the number of -# ready replicas and the second number is the number of total replicas. We -# grep such format to ensure that the service is ready, and early exit if any -# failure detected. In the end we sleep for -# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have -# enough time to sync with the controller and get all ready replica IPs. -_SERVE_WAIT_UNTIL_READY = ( - '{{ while true; do' - ' s=$(sky serve status {name}); echo "$s";' - ' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;' - ' echo "$s" | grep -q "FAILED" && exit 1;' - ' sleep 10;' - ' done; }}; echo "Got service status $s";' - f'sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS + 2};') -_IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' -_AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' -_SERVICE_LAUNCHING_STATUS_REGEX = 'PROVISIONING\|STARTING' -# Since we don't allow terminate the service if the controller is INIT, -# which is common for simultaneous pytest, we need to wait until the -# controller is UP before we can terminate the service. -# The teardown command has a 10-mins timeout, so we don't need to do -# the timeout here. See implementation of run_one_test() for details. 
-_TEARDOWN_SERVICE = ( - '(for i in `seq 1 20`; do' - ' s=$(sky serve down -y {name});' - ' echo "Trying to terminate {name}";' - ' echo "$s";' - ' echo "$s" | grep -q "scheduled to be terminated\|No service to terminate" && break;' - ' sleep 10;' - ' [ $i -eq 20 ] && echo "Failed to terminate service {name}";' - 'done)') - -_SERVE_ENDPOINT_WAIT = ( - 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' - 'endpoint=$(sky serve status --endpoint {name}); ' - 'until ! echo "$endpoint" | grep "Controller is initializing"; ' - 'do echo "Waiting for serve endpoint to be ready..."; ' - 'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; ' - 'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"') - -_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); ' - 'until ! echo "$s" | grep "Controller is initializing."; ' - 'do echo "Waiting for serve status to be ready..."; ' - 'sleep 5; s=$(sky serve status {name}); done; echo "$s"') - - -def _get_replica_ip(name: str, replica_id: int) -> str: - return (f'ip{replica_id}=$(echo "$s" | ' - f'awk "{_AWK_ALL_LINES_BELOW_REPLICAS}" | ' - f'grep -E "{name}\s+{replica_id}" | ' - f'grep -Eo "{_IP_REGEX}")') - - -def _get_skyserve_http_test(name: str, cloud: str, - timeout_minutes: int) -> smoke_tests_utils.Test: - test = smoke_tests_utils.Test( - f'test-skyserve-{cloud.replace("_", "-")}', - [ - f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=timeout_minutes * 60, - ) - return test - - -def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool, - str]]) -> str: - """Check replicas' status and count in sky serve status - - We will check vCPU=2, as all our tests use vCPU=2. 
- - Args: - name: the name of the service - check_tuples: A list of replica property to check. Each tuple is - (count, is_spot, status) - """ - check_cmd = '' - for check_tuple in check_tuples: - count, is_spot, status = check_tuple - resource_str = '' - if status not in ['PENDING', 'SHUTTING_DOWN' - ] and not status.startswith('FAILED'): - spot_str = '' - if is_spot: - spot_str = '\[Spot\]' - resource_str = f'({spot_str}vCPU=2)' - check_cmd += (f' echo "$s" | grep "{resource_str}" | ' - f'grep "{status}" | wc -l | grep {count} || exit 1;') - return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd) - - -def _check_service_version(service_name: str, version: str) -> str: - # Grep the lines before 'Service Replicas' and check if the service version - # is correct. - return (f'echo "$s" | grep -B1000 "Service Replicas" | ' - f'grep -E "{service_name}\s+{version}" || exit 1; ') - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_gcp_http(): - """Test skyserve on GCP""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'gcp', 20) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.aws -@pytest.mark.serve -def test_skyserve_aws_http(): - """Test skyserve on AWS""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'aws', 20) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.azure -@pytest.mark.serve -def test_skyserve_azure_http(): - """Test skyserve on Azure""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'azure', 30) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -@pytest.mark.serve -def test_skyserve_kubernetes_http(): - """Test skyserve on Kubernetes""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'kubernetes', 30) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.oci -@pytest.mark.serve -def test_skyserve_oci_http(): - """Test skyserve on OCI""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'oci', 
20) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now -@pytest.mark.serve -def test_skyserve_llm(generic_cloud: str): - """Test skyserve with real LLM usecase""" - name = _get_service_name() - - def generate_llm_test_command(prompt: str, expected_output: str) -> str: - prompt = shlex.quote(prompt) - expected_output = shlex.quote(expected_output) - return ( - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python tests/skyserve/llm/get_response.py --endpoint $endpoint ' - f'--prompt {prompt} | grep {expected_output}') - - with open('tests/skyserve/llm/prompt_output.json', 'r', - encoding='utf-8') as f: - prompt2output = json.load(f) - - test = smoke_tests_utils.Test( - f'test-skyserve-llm', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - *[ - generate_llm_test_command(prompt, output) - for prompt, output in prompt2output.items() - ], - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=40 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_spot_recovery(): - name = _get_service_name() - zone = 'us-central1-a' - - test = smoke_tests_utils.Test( - f'test-skyserve-spot-recovery-gcp', - [ - f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - smoke_tests_utils.terminate_gcp_replica(name, zone, 1), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - 
smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack does not support spot instances -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_base_ondemand_fallback(generic_cloud: str): - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-base-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/spot/base_ondemand_fallback.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(1, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -def test_skyserve_dynamic_ondemand_fallback(): - name = _get_service_name() - zone = 'us-central1-a' - - test = smoke_tests_utils.Test( - f'test-skyserve-dynamic-ondemand-fallback', - [ - f'sky serve up -n {name} --cloud gcp -y tests/skyserve/spot/dynamic_ondemand_fallback.yaml', - f'sleep 40', - # 2 on-demand (provisioning) + 2 Spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s";' - 'echo "$s" | grep -q "0/4" || exit 1', - # Wait for the provisioning starts - f'sleep 40', - _check_replica_in_status(name, [ - (2, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (2, False, _SERVICE_LAUNCHING_STATUS_REGEX + '\|SHUTTING_DOWN') - ]), - - # Wait until 2 spot instances are ready. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - smoke_tests_utils.terminate_gcp_replica(name, zone, 1), - f'sleep 40', - # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - 'echo "$s" | grep -q "1/3"', - _check_replica_in_status( - name, [(1, True, 'READY'), - (1, True, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - - # Wait until 2 spot instances are ready. 
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - _check_replica_in_status(name, [(2, True, 'READY'), - (0, False, '')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_user_bug_restart(generic_cloud: str): - """Tests that we restart the service after user bug.""" - # TODO(zhwu): this behavior needs some rethinking. - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-user-bug-restart', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/restart/user_bug.yaml', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "SHUTTING_DOWN"; ' - 'do echo "Waiting for first service to be SHUTTING DOWN..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; ', - f's=$(sky serve status {name}); echo "$s";' - 'until echo "$s" | grep -A 100 "Service Replicas" | grep "FAILED"; ' - 'do echo "Waiting for first service to be FAILED..."; ' - f'sleep 5; s=$(sky serve status {name}); echo "$s"; done; echo "$s"; ' - + _check_replica_in_status(name, [(1, True, 'FAILED')]) + - # User bug failure will cause no further scaling. 
- f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 1; ' - f'echo "$s" | grep -B 100 "NO_REPLICA" | grep "0/0"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/auto_restart.yaml', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, SkyPilot here!"; do sleep 2; done; sleep 2; ' - + _check_replica_in_status(name, [(1, False, 'READY'), - (1, False, 'FAILED')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -@pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP -def test_skyserve_load_balancer(generic_cloud: str): - """Test skyserve load balancer round-robin policy""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-load-balancer', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - f'{_SERVE_STATUS_WAIT.format(name=name)}; ' - f'{_get_replica_ip(name, 1)}; ' - f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' - 'python tests/skyserve/load_balancer/test_round_robin.py ' - '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.gcp -@pytest.mark.serve -@pytest.mark.no_kubernetes -def test_skyserve_auto_restart(): - """Test skyserve with auto restart""" - name = _get_service_name() - zone = 'us-central1-a' - test = smoke_tests_utils.Test( - f'test-skyserve-auto-restart', - [ - # TODO(tian): we can dynamically generate YAML from template to - # avoid maintaining too many YAML files - f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - 
f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - # sleep for 20 seconds (initial delay) to make sure it will - # be restarted - f'sleep 20', - smoke_tests_utils.terminate_gcp_replica(name, zone, 1), - # Wait for consecutive failure timeout passed. - # If the cluster is not using spot, it won't check the cluster status - # on the cloud (since manual shutdown is not a common behavior and such - # queries takes a lot of time). Instead, we think continuous 3 min probe - # failure is not a temporary problem but indeed a failure. - 'sleep 180', - # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time - # that the output of `sky serve status` shows FAILED and this status will - # cause _SERVE_WAIT_UNTIL_READY to early quit. - '(while true; do' - f' output=$(sky serve status {name});' - ' echo "$output" | grep -q "1/1" && break;' - ' sleep 10;' - f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_cancel(generic_cloud: str): - """Test skyserve with cancel""" - name = _get_service_name() - - test = smoke_tests_utils.Test( - f'test-skyserve-cancel', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/cancel/cancel.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' - 'tests/skyserve/cancel/send_cancel_request.py ' - '--endpoint $endpoint | grep "Request was cancelled"', - f's=$(sky serve logs {name} 1 --no-follow); ' - 'until ! 
echo "$s" | grep "Please wait for the controller to be"; ' - 'do echo "Waiting for serve logs"; sleep 10; ' - f's=$(sky serve logs {name} 1 --no-follow); done; ' - 'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_streaming(generic_cloud: str): - """Test skyserve with streaming""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-streaming', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/streaming/streaming.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'python3 tests/skyserve/streaming/send_streaming_request.py ' - '--endpoint $endpoint | grep "Streaming test passed"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_readiness_timeout_fail(generic_cloud: str): - """Test skyserve with large readiness probe latency, expected to fail""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-readiness-timeout-fail', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task.yaml', - # None of the readiness probe will pass, so the service will be - # terminated after the initial delay. 
- f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_large_readiness_timeout(generic_cloud: str): - """Test skyserve with customized large readiness timeout""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-large-readiness-timeout', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'request_output=$(curl http://$endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_update(generic_cloud: str): - """Test skyserve with update""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/new.yaml', - # sleep before update is registered. 
- 'sleep 20', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;' - # Make sure the traffic is not mixed - 'curl http://$endpoint | grep "Hi, new SkyPilot here"', - # The latest 2 version should be READY and the older versions should be shutting down - (_check_replica_in_status(name, [(2, False, 'READY'), - (2, False, 'SHUTTING_DOWN')]) + - _check_service_version(name, "2")), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_rolling_update(generic_cloud: str): - """Test skyserve with rolling update""" - name = _get_service_name() - single_new_replica = _check_replica_in_status( - name, [(2, False, 'READY'), (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN')]) - test = smoke_tests_utils.Test( - f'test-skyserve-rolling-update', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/new.yaml', - # Make sure the traffic is mixed across two versions, the replicas - # with even id will sleep 60 seconds before being ready, so we - # should be able to get observe the period that the traffic is mixed - # across two versions. 
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'until curl http://$endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; ' - # The latest version should have one READY and the one of the older versions should be shutting down - f'{single_new_replica} {_check_service_version(name, "1,2")} ' - # Check the output from the old version, immediately after the - # output from the new version appears. This is guaranteed by the - # round robin load balancing policy. - # TODO(zhwu): we should have a more generalized way for checking the - # mixed version of replicas to avoid depending on the specific - # round robin load balancing policy. - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_fast_update(generic_cloud: str): - """Test skyserve with fast update (Increment version of old replicas)""" - name = _get_service_name() - - test = smoke_tests_utils.Test( - f'test-skyserve-fast-update', - [ - f'sky serve up -n {name} -y --cloud {generic_cloud} tests/skyserve/update/bump_version_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', - # sleep to wait for update to be registered. - 'sleep 40', - # 2 on-deamnd (ready) + 1 on-demand (provisioning). - ( - _check_replica_in_status( - name, [(2, False, 'READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) + - # Fast update will directly have the latest version ready. 
- _check_service_version(name, "2")), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - # Test rolling update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', - # sleep to wait for update to be registered. - 'sleep 25', - # 2 on-deamnd (ready) + 1 on-demand (shutting down). - _check_replica_in_status(name, [(2, False, 'READY'), - (1, False, 'SHUTTING_DOWN')]), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl http://$endpoint | grep "Hi, SkyPilot here"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.serve -def test_skyserve_update_autoscale(generic_cloud: str): - """Test skyserve update with autoscale""" - name = _get_service_name() - test = smoke_tests_utils.Test( - f'test-skyserve-update-autoscale', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/num_min_one.yaml', - # sleep before update is registered. - 'sleep 20', - # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + - _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - # Rolling Update - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', - # sleep before update is registered. 
- 'sleep 20', - # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here!"', - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=30 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Spot instances are note supported by Fluidstack -@pytest.mark.serve -@pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes -@pytest.mark.parametrize('mode', ['rolling', 'blue_green']) -def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): - """Test skyserve with update that changes autoscaler""" - name = f'{_get_service_name()}-{mode}' - - wait_until_no_pending = ( - f's=$(sky serve status {name}); echo "$s"; ' - 'until ! echo "$s" | grep PENDING; do ' - ' echo "Waiting for replica to be out of pending..."; ' - f' sleep 5; s=$(sky serve status {name}); ' - ' echo "$s"; ' - 'done') - four_spot_up_cmd = _check_replica_in_status(name, [(4, True, 'READY')]) - update_check = [f'until ({four_spot_up_cmd}); do sleep 5; done; sleep 15;'] - if mode == 'rolling': - # Check rolling update, it will terminate one of the old on-demand - # instances, once there are 4 spot instance ready. - update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (1, False, 'SHUTTING_DOWN'), (1, False, 'READY')]) + - _check_service_version(name, "1,2"), - ] - else: - # Check blue green update, it will keep both old on-demand instances - # running, once there are 4 spot instance ready. 
- update_check += [ - _check_replica_in_status( - name, [(1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]) + - _check_service_version(name, "1"), - ] - test = smoke_tests_utils.Test( - f'test-skyserve-new-autoscaler-update-{mode}', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + - _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 's=$(curl http://$endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"', - f'sky serve update {name} --cloud {generic_cloud} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml', - # Wait for update to be registered - f'sleep 90', - wait_until_no_pending, - _check_replica_in_status( - name, [(4, True, _SERVICE_LAUNCHING_STATUS_REGEX + '\|READY'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX), - (2, False, 'READY')]), - *update_check, - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' - 'curl http://$endpoint | grep "Hi, SkyPilot here"', - _check_replica_in_status(name, [(4, True, 'READY'), - (1, False, 'READY')]), - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -@pytest.mark.no_fluidstack -@pytest.mark.serve -def test_skyserve_failures(generic_cloud: str): - """Test replica failure statuses""" - name = _get_service_name() - - test = smoke_tests_utils.Test( - 'test-skyserve-failures', - [ - f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/failures/initial_delay.yaml', - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_INITIAL_DELAY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - 'sleep 60', - 
f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 2; ' - # Make sure no new replicas are started for early failure. - f'echo "$s" | grep -A 100 "Service Replicas" | grep "{name}" | wc -l | grep 2;', - f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/failures/probing.yaml', - f's=$(sky serve status {name}); ' - # Wait for replica to be ready. - f'until echo "$s" | grep "READY"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done;', - # Wait for replica to change to FAILED_PROBING - f's=$(sky serve status {name}); ' - f'until echo "$s" | grep "FAILED_PROBING"; do ' - 'echo "Waiting for replica to be failed..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done', - # Wait for the PENDING replica to appear. - 'sleep 10', - # Wait until the replica is out of PENDING. - f's=$(sky serve status {name}); ' - f'until ! echo "$s" | grep "PENDING" && ! echo "$s" | grep "Please wait for the controller to be ready."; do ' - 'echo "Waiting for replica to be out of pending..."; sleep 5; ' - f's=$(sky serve status {name}); echo "$s"; done; ' + - _check_replica_in_status( - name, [(1, False, 'FAILED_PROBING'), - (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]), - # TODO(zhwu): add test for FAILED_PROVISION - ], - _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -# TODO(Ziming, Tian): Add tests for autoscaling. 
- - -# ------- Testing user dependencies -------- -def test_user_dependencies(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'user-dependencies', - [ - f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', - f'sky logs {name} 1 --status', - f'sky exec {name} "echo hi"', - f'sky logs {name} 2 --status', - f'sky status -r {name} | grep UP', - f'sky exec {name} "echo bye"', - f'sky logs {name} 3 --status', - f'sky launch -c {name} tests/test_yamls/different_default_conda_env.yaml', - f'sky logs {name} 4 --status', - # Launch again to test the default env does not affect SkyPilot - # runtime setup - f'sky launch -c {name} "python --version 2>&1 | grep \'Python 3.6\' || exit 1"', - f'sky logs {name} 5 --status', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ------- Testing the core API -------- -# Most of the core APIs have been tested in the CLI tests. -# These tests are for testing the return value of the APIs not fully used in CLI. - - -@pytest.mark.gcp -def test_core_api_sky_launch_exec(): - name = smoke_tests_utils.get_cluster_name() - task = sky.Task(run="whoami") - task.set_resources(sky.Resources(cloud=sky.GCP())) - job_id, handle = sky.launch(task, cluster_name=name) - assert job_id == 1 - assert handle is not None - assert handle.cluster_name == name - assert handle.launched_resources.cloud.is_same_cloud(sky.GCP()) - job_id_exec, handle_exec = sky.exec(task, cluster_name=name) - assert job_id_exec == 2 - assert handle_exec is not None - assert handle_exec.cluster_name == name - assert handle_exec.launched_resources.cloud.is_same_cloud(sky.GCP()) - # For dummy task (i.e. task.run is None), the job won't be submitted. 
- dummy_task = sky.Task() - job_id_dummy, _ = sky.exec(dummy_task, cluster_name=name) - assert job_id_dummy is None - sky.down(name) - - -# The sky launch CLI has some additional checks to make sure the cluster is up/ -# restarted. However, the core API doesn't have these; make sure it still works -def test_core_api_sky_launch_fast(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) - try: - task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - # Sleep to let the cluster autostop - smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( - cluster_name=name, - cluster_status=[sky.ClusterStatus.STOPPED], - timeout=120) - # Run it again - should work with fast=True - sky.launch(task, - cluster_name=name, - idle_minutes_to_autostop=1, - fast=True) - finally: - sky.down(name) - - -# ---------- Testing Storage ---------- -class TestStorageWithCredentials: - """Storage tests which require credentials and network connection""" - - AWS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - 'bucket-s3alias', # ends with '-s3alias' suffix - 'bucket--ol-s3', # ends with '--ol-s3' suffix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - ] - - GCS_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters (without dots) - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two 
adjacent periods - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1' - # More than 63 characters between dots - 'abc_.def.ghi.jklmnopqrstuvwxyzabcdefghijklmnopqfghijklmnopqrstuvw' * 5, - # more than 222 characters (with dots) - '192.168.5.4', # formatted as an IP address - 'googbucket', # starts with 'goog' prefix - 'googlebucket', # contains 'google' - 'g00glebucket', # variant of 'google' - 'go0glebucket', # variant of 'google' - 'g0oglebucket', # variant of 'google' - '.abc', # starts with a dot - 'abc.', # ends with a dot - '_abc', # starts with an underscore - 'abc_', # ends with an underscore - ] - - AZURE_INVALID_NAMES = [ - 'ab', # less than 3 characters - # more than 63 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - 'Abcdef', # contains an uppercase letter - '.abc', # starts with a non-letter(dot) - 'a--bc', # contains consecutive hyphens - ] - - IBM_INVALID_NAMES = [ - 'ab', # less than 3 characters - 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1', - # more than 63 characters - 'Abcdef', # contains an uppercase letter - 'abc def', # contains a space - 'abc..def', # two adjacent periods - '192.168.5.4', # formatted as an IP address - 'xn--bucket', # starts with 'xn--' prefix - '.abc', # starts with a dot - 'abc.', # ends with a dot - '-abc', # starts with a hyphen - 'abc-', # ends with a hyphen - 'a.-bc', # contains the sequence '.-' - 'a-.bc', # contains the sequence '-.' 
- 'a&bc' # contains special characters - 'ab^c' # contains special characters - ] - GITIGNORE_SYNC_TEST_DIR_STRUCTURE = { - 'double_asterisk': { - 'double_asterisk_excluded': None, - 'double_asterisk_excluded_dir': { - 'dir_excluded': None, - }, - }, - 'double_asterisk_parent': { - 'parent': { - 'also_excluded.txt': None, - 'child': { - 'double_asterisk_parent_child_excluded.txt': None, - }, - 'double_asterisk_parent_excluded.txt': None, - }, - }, - 'excluded.log': None, - 'excluded_dir': { - 'excluded.txt': None, - 'nested_excluded': { - 'excluded': None, - }, - }, - 'exp-1': { - 'be_excluded': None, - }, - 'exp-2': { - 'be_excluded': None, - }, - 'front_slash_excluded': None, - 'included.log': None, - 'included.txt': None, - 'include_dir': { - 'excluded.log': None, - 'included.log': None, - }, - 'nested_double_asterisk': { - 'one': { - 'also_exclude.txt': None, - }, - 'two': { - 'also_exclude.txt': None, - }, - }, - 'nested_wildcard_dir': { - 'monday': { - 'also_exclude.txt': None, - }, - 'tuesday': { - 'also_exclude.txt': None, - }, - }, - 'no_slash_excluded': None, - 'no_slash_tests': { - 'no_slash_excluded': { - 'also_excluded.txt': None, - }, - }, - 'question_mark': { - 'excluded1.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket': { - 'excluded1.txt': None, - }, - 'square_bracket_alpha': { - 'excludedz.txt': None, - }, - 'square_bracket_excla': { - 'excluded2.txt': None, - 'excluded@.txt': None, - }, - 'square_bracket_single': { - 'excluded0.txt': None, - }, - } - - @staticmethod - def create_dir_structure(base_path, structure): - # creates a given file STRUCTURE in BASE_PATH - for name, substructure in structure.items(): - path = os.path.join(base_path, name) - if substructure is None: - # Create a file - open(path, 'a', encoding='utf-8').close() - else: - # Create a subdirectory - os.mkdir(path) - TestStorageWithCredentials.create_dir_structure( - path, substructure) - - @staticmethod - def cli_delete_cmd(store_type, - bucket_name, - 
storage_account_name: str = None): - if store_type == storage_lib.StoreType.S3: - url = f's3://{bucket_name}' - return f'aws s3 rb {url} --force' - if store_type == storage_lib.StoreType.GCS: - url = f'gs://{bucket_name}' - gsutil_alias, alias_gen = data_utils.get_gsutil_command() - return f'{alias_gen}; {gsutil_alias} rm -r {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage container delete ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} ' - f'--name {bucket_name}') - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {url} --force --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone purge {bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' - - @staticmethod - def cli_ls_cmd(store_type, bucket_name, suffix=''): - if store_type == storage_lib.StoreType.S3: - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'aws s3 ls {url}' - if store_type == storage_lib.StoreType.GCS: - if suffix: - url = f'gs://{bucket_name}/{suffix}' - else: - url = f'gs://{bucket_name}' - return f'gsutil ls {url}' - if store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - config_storage_account = skypilot_config.get_nested( - ('azure', 'storage_account'), None) - storage_account_name = config_storage_account if ( - config_storage_account is not None) else ( - 
storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - list_cmd = ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key}') - return list_cmd - if store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - url = f's3://{bucket_name}/{suffix}' - else: - url = f's3://{bucket_name}' - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' - if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' - - @staticmethod - def cli_region_cmd(store_type, bucket_name=None, storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - assert bucket_name is not None - return ('aws s3api get-bucket-location ' - f'--bucket {bucket_name} --output text') - elif store_type == storage_lib.StoreType.GCS: - assert bucket_name is not None - return (f'gsutil ls -L -b gs://{bucket_name}/ | ' - 'grep "Location constraint" | ' - 'awk \'{print tolower($NF)}\'') - elif store_type == storage_lib.StoreType.AZURE: - # For Azure Blob Storage, the location of the containers are - # determined by the location of storage accounts. 
- assert storage_account_name is not None - return (f'az storage account show --name {storage_account_name} ' - '--query "primaryLocation" --output tsv') - else: - raise NotImplementedError(f'Region command not implemented for ' - f'{store_type}') - - @staticmethod - def cli_count_name_in_bucket(store_type, - bucket_name, - file_name, - suffix='', - storage_account_name=None): - if store_type == storage_lib.StoreType.S3: - if suffix: - return f'aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - else: - return f'aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' - elif store_type == storage_lib.StoreType.GCS: - if suffix: - return f'gsutil ls -r gs://{bucket_name}/{suffix} | grep "{file_name}" | wc -l' - else: - return f'gsutil ls -r gs://{bucket_name} | grep "{file_name}" | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - if storage_account_name is None: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--prefix {shlex.quote(suffix)} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - f'grep {file_name} | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - if suffix: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - else: - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --query 
"length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' - - @staticmethod - def cli_count_file_in_bucket(store_type, bucket_name): - if store_type == storage_lib.StoreType.S3: - return f'aws s3 ls s3://{bucket_name} --recursive | wc -l' - elif store_type == storage_lib.StoreType.GCS: - return f'gsutil ls -r gs://{bucket_name}/** | wc -l' - elif store_type == storage_lib.StoreType.AZURE: - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - return ('az storage blob list ' - f'--container-name {bucket_name} ' - f'--account-name {storage_account_name} ' - f'--account-key {storage_account_key} | ' - 'grep \\"name\\": | ' - 'wc -l') - elif store_type == storage_lib.StoreType.R2: - endpoint_url = cloudflare.create_endpoint() - return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{bucket_name} --recursive --endpoint {endpoint_url} --profile=r2 | wc -l' - - @pytest.fixture - def tmp_source(self, tmp_path): - # Creates a temporary directory with a file in it - tmp_dir = tmp_path / 'tmp-source' - tmp_dir.mkdir() - tmp_file = tmp_dir / 'tmp-file' - tmp_file.write_text('test') - circle_link = tmp_dir / 'circle-link' - circle_link.symlink_to(tmp_dir, target_is_directory=True) - yield str(tmp_dir) - - @staticmethod - def generate_bucket_name(): - # Creates a temporary bucket name - # time.time() returns varying precision on different systems, so we - # replace the decimal point and use whatever precision we can get. 
- timestamp = str(time.time()).replace('.', '') - return f'sky-test-{timestamp}' - - @pytest.fixture - def tmp_bucket_name(self): - yield self.generate_bucket_name() - - @staticmethod - def yield_storage_object( - name: Optional[str] = None, - source: Optional[storage_lib.Path] = None, - stores: Optional[Dict[storage_lib.StoreType, - storage_lib.AbstractStore]] = None, - persistent: Optional[bool] = True, - mode: storage_lib.StorageMode = storage_lib.StorageMode.MOUNT): - # Creates a temporary storage object. Stores must be added in the test. - storage_obj = storage_lib.Storage(name=name, - source=source, - stores=stores, - persistent=persistent, - mode=mode) - yield storage_obj - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_scratch_storage_obj(self, tmp_bucket_name): - # Creates a storage object with no source to create a scratch storage. - # Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name) - - @pytest.fixture - def tmp_multiple_scratch_storage_obj(self): - # Creates a list of 5 storage objects with no source to create - # multiple scratch storages. - # Stores for each object in the list must be added in the test. 
- storage_mult_obj = [] - for _ in range(5): - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}') - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - # If handle exists, delete manually - # TODO(romilb): This is potentially risky - if the delete method has - # bugs, this can cause resource leaks. Ideally we should manually - # eject storage from global_user_state and delete the bucket using - # boto3 directly. - storage_obj.delete() - - @pytest.fixture - def tmp_multiple_custom_source_storage_obj(self): - # Creates a list of storage objects with custom source names to - # create multiple scratch storages. - # Stores for each object in the list must be added in the test. - custom_source_names = ['"path With Spaces"', 'path With Spaces'] - storage_mult_obj = [] - for name in custom_source_names: - src_path = os.path.expanduser(f'~/{name}') - pathlib.Path(src_path).expanduser().mkdir(exist_ok=True) - timestamp = str(time.time()).replace('.', '') - store_obj = storage_lib.Storage(name=f'sky-test-{timestamp}', - source=src_path) - storage_mult_obj.append(store_obj) - yield storage_mult_obj - for storage_obj in storage_mult_obj: - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.fixture - def tmp_local_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temporary storage object. Stores must be added in the test. - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmp_source) - - @pytest.fixture - def tmp_local_list_storage_obj(self, tmp_bucket_name, tmp_source): - # Creates a temp storage object which uses a list of paths as source. - # Stores must be added in the test. 
After upload, the bucket should - # have two files - /tmp-file and /tmp-source/tmp-file - list_source = [tmp_source, tmp_source + '/tmp-file'] - yield from self.yield_storage_object(name=tmp_bucket_name, - source=list_source) - - @pytest.fixture - def tmp_bulk_del_storage_obj(self, tmp_bucket_name): - # Creates a temporary storage object for testing bulk deletion. - # Stores must be added in the test. - with tempfile.TemporaryDirectory() as tmpdir: - subprocess.check_output(f'mkdir -p {tmpdir}/folder{{000..255}}', - shell=True) - subprocess.check_output(f'touch {tmpdir}/test{{000..255}}.txt', - shell=True) - subprocess.check_output( - f'touch {tmpdir}/folder{{000..255}}/test.txt', shell=True) - yield from self.yield_storage_object(name=tmp_bucket_name, - source=tmpdir) - - @pytest.fixture - def tmp_copy_mnt_existing_storage_obj(self, tmp_scratch_storage_obj): - # Creates a copy mount storage which reuses an existing storage object. - tmp_scratch_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_scratch_storage_obj.name - - # Try to initialize another storage with the storage object created - # above, but now in COPY mode. This should succeed. - yield from self.yield_storage_object(name=storage_name, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_gitignore_storage_obj(self, tmp_bucket_name, gitignore_structure): - # Creates a temporary storage object for testing .gitignore filter. - # GITIGINORE_STRUCTURE is representing a file structure in a dictionary - # format. Created storage object will contain the file structure along - # with .gitignore and .git/info/exclude files to test exclude filter. - # Stores must be added in the test. 
- with tempfile.TemporaryDirectory() as tmpdir: - # Creates file structure to be uploaded in the Storage - self.create_dir_structure(tmpdir, gitignore_structure) - - # Create .gitignore and list files/dirs to be excluded in it - skypilot_path = os.path.dirname(os.path.dirname(sky.__file__)) - temp_path = f'{tmpdir}/.gitignore' - file_path = os.path.join(skypilot_path, 'tests/gitignore_test') - shutil.copyfile(file_path, temp_path) - - # Create .git/info/exclude and list files/dirs to be excluded in it - temp_path = f'{tmpdir}/.git/info/' - os.makedirs(temp_path) - temp_exclude_path = os.path.join(temp_path, 'exclude') - file_path = os.path.join(skypilot_path, - 'tests/git_info_exclude_test') - shutil.copyfile(file_path, temp_exclude_path) - - # Create sky Storage with the files created - yield from self.yield_storage_object( - name=tmp_bucket_name, - source=tmpdir, - mode=storage_lib.StorageMode.COPY) - - @pytest.fixture - def tmp_awscli_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call(['aws', 's3', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['aws', 's3', 'rb', bucket_uri, '--force']) - - @pytest.fixture - def tmp_gsutil_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - bucket_uri = f'gs://{tmp_bucket_name}' - subprocess.check_call(['gsutil', 'mb', bucket_uri]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call(['gsutil', 'rm', '-r', bucket_uri]) - - @pytest.fixture - def tmp_az_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using gsutil - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - bucket_uri = data_utils.AZURE_CONTAINER_URL.format( - storage_account_name=storage_account_name, - container_name=tmp_bucket_name) - 
subprocess.check_call([ - 'az', 'storage', 'container', 'create', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - yield tmp_bucket_name, bucket_uri - subprocess.check_call([ - 'az', 'storage', 'container', 'delete', '--name', - f'{tmp_bucket_name}', '--account-name', f'{storage_account_name}', - '--account-key', f'{storage_account_key}' - ]) - - @pytest.fixture - def tmp_awscli_bucket_r2(self, tmp_bucket_name): - # Creates a temporary bucket using awscli - endpoint_url = cloudflare.create_endpoint() - bucket_uri = f's3://{tmp_bucket_name}' - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 mb {bucket_uri} --endpoint {endpoint_url} --profile=r2', - shell=True) - yield tmp_bucket_name, bucket_uri - subprocess.check_call( - f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {bucket_uri} --force --endpoint {endpoint_url} --profile=r2', - shell=True) - - @pytest.fixture - def tmp_ibm_cos_bucket(self, tmp_bucket_name): - # Creates a temporary bucket using IBM COS API - storage_obj = storage_lib.IBMCosStore(source="", name=tmp_bucket_name) - yield tmp_bucket_name - storage_obj.delete() - - @pytest.fixture - def tmp_public_storage_obj(self, request): - # Initializes a storage object with a public bucket - storage_obj = storage_lib.Storage(source=request.param) - yield storage_obj - # This does not require any deletion logic because it is a public bucket - # and should not get added to global_user_state. 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, - store_type): - # Creates a new bucket with a local source, uploads files to it - # and deletes it. - tmp_local_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name in out.decode('utf-8') - - # Run sky storage delete to delete the storage object - subprocess.check_output( - ['sky', 'storage', 'delete', tmp_local_storage_obj.name, '--yes']) - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_local_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.xdist_group('multiple_bucket_deletion') - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm) - ]) - def test_multiple_buckets_creation_and_deletion( - self, tmp_multiple_scratch_storage_obj, store_type): - # Creates multiple new buckets(5 buckets) with a local source - # and deletes them. 
- storage_obj_name = [] - for store_obj in tmp_multiple_scratch_storage_obj: - store_obj.add_store(store_type) - storage_obj_name.append(store_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in out for item in storage_obj_name]) - - # Run sky storage delete all to delete all storage objects - delete_cmd = ['sky', 'storage', 'delete', '--yes'] - delete_cmd += storage_obj_name - subprocess.check_output(delete_cmd) - - # Run sky storage ls to check if all storage objects filtered by store - # type are deleted - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item not in out for item in storage_obj_name]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_upload_source_with_spaces(self, store_type, - tmp_multiple_custom_source_storage_obj): - # Creates two buckets with specified local sources - # with spaces in the name - storage_obj_names = [] - for storage_obj in tmp_multiple_custom_source_storage_obj: - storage_obj.add_store(store_type) - storage_obj_names.append(storage_obj.name) - - # Run sky storage ls to check if all storage objects exists in the - # output filtered by store type - out_all = subprocess.check_output(['sky', 'storage', 'ls']) - out = [ - item.split()[0] - for item in out_all.decode('utf-8').splitlines() - if store_type.value in item - ] - assert all([item in 
out for item in storage_obj_names]) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_external_deletion(self, tmp_scratch_storage_obj, - store_type): - # Creates a bucket, deletes it externally using cloud cli commands - # and then tries to delete it using sky storage delete. - tmp_scratch_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name in out.decode('utf-8') - - # Delete bucket externally - cmd = self.cli_delete_cmd(store_type, tmp_scratch_storage_obj.name) - subprocess.check_output(cmd, shell=True) - - # Run sky storage delete to delete the storage object - out = subprocess.check_output( - ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name, '--yes']) - # Make sure bucket was not created during deletion (see issue #1322) - assert 'created' not in out.decode('utf-8').lower() - - # Run sky storage ls to check if storage object is deleted - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_scratch_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_bucket_bulk_deletion(self, store_type, tmp_bulk_del_storage_obj): - # Creates a temp folder with over 256 files and folders, upload - # files and folders to a new bucket, then delete bucket. 
- tmp_bulk_del_storage_obj.add_store(store_type) - - subprocess.check_output([ - 'sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name, '--yes' - ]) - - output = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_bulk_del_storage_obj.name not in output.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'tmp_public_storage_obj, store_type', - [('s3://tcga-2-open', storage_lib.StoreType.S3), - ('s3://digitalcorpora', storage_lib.StoreType.S3), - ('gs://gcp-public-data-sentinel-2', storage_lib.StoreType.GCS), - pytest.param( - 'https://azureopendatastorage.blob.core.windows.net/nyctlc', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure)], - indirect=['tmp_public_storage_obj']) - def test_public_bucket(self, tmp_public_storage_obj, store_type): - # Creates a new bucket with a public source and verifies that it is not - # added to global_user_state. - tmp_public_storage_obj.add_store(store_type) - - # Run sky storage ls to check if storage object exists in the output - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert tmp_public_storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'nonexist_bucket_url', - [ - 's3://{random_name}', - 'gs://{random_name}', - pytest.param( - 'https://{account_name}.blob.core.windows.net/{random_name}', # pylint: disable=line-too-long - marks=pytest.mark.azure), - pytest.param('cos://us-east/{random_name}', marks=pytest.mark.ibm), - pytest.param('r2://{random_name}', marks=pytest.mark.cloudflare) - ]) - def test_nonexistent_bucket(self, nonexist_bucket_url): - # Attempts to create fetch a stroage with a non-existent source. 
- # Generate a random bucket name and verify it doesn't exist: - retry_count = 0 - while True: - nonexist_bucket_name = str(uuid.uuid4()) - if nonexist_bucket_url.startswith('s3'): - command = f'aws s3api head-bucket --bucket {nonexist_bucket_name}' - expected_output = '404' - elif nonexist_bucket_url.startswith('gs'): - command = f'gsutil ls {nonexist_bucket_url.format(random_name=nonexist_bucket_name)}' - expected_output = 'BucketNotFoundException' - elif nonexist_bucket_url.startswith('https'): - default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.get_default_storage_account_name( - default_region)) - storage_account_key = data_utils.get_az_storage_account_key( - storage_account_name) - command = f'az storage container exists --account-name {storage_account_name} --account-key {storage_account_key} --name {nonexist_bucket_name}' - expected_output = '"exists": false' - elif nonexist_bucket_url.startswith('r2'): - endpoint_url = cloudflare.create_endpoint() - command = f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api head-bucket --bucket {nonexist_bucket_name} --endpoint {endpoint_url} --profile=r2' - expected_output = '404' - elif nonexist_bucket_url.startswith('cos'): - # Using API calls, since using rclone requires a profile's name - try: - expected_output = command = "echo" # avoid unrelated exception in case of failure. 
- bucket_name = urllib.parse.urlsplit( - nonexist_bucket_url.format( - random_name=nonexist_bucket_name)).path.strip('/') - client = ibm.get_cos_client('us-east') - client.head_bucket(Bucket=bucket_name) - except ibm.ibm_botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == '404': - # success - return - else: - raise ValueError('Unsupported bucket type ' - f'{nonexist_bucket_url}') - - # Check if bucket exists using the cli: - try: - out = subprocess.check_output(command, - stderr=subprocess.STDOUT, - shell=True) - except subprocess.CalledProcessError as e: - out = e.output - out = out.decode('utf-8') - if expected_output in out: - break - else: - retry_count += 1 - if retry_count > 3: - raise RuntimeError('Unable to find a nonexistent bucket ' - 'to use. This is higly unlikely - ' - 'check if the tests are correct.') - - with pytest.raises(sky.exceptions.StorageBucketGetError, - match='Attempted to use a non-existent'): - if nonexist_bucket_url.startswith('https'): - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - account_name=storage_account_name, - random_name=nonexist_bucket_name)) - else: - storage_obj = storage_lib.Storage( - source=nonexist_bucket_url.format( - random_name=nonexist_bucket_name)) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'private_bucket', - [ - f's3://imagenet', - f'gs://imagenet', - pytest.param('https://smoketestprivate.blob.core.windows.net/test', - marks=pytest.mark.azure), # pylint: disable=line-too-long - pytest.param('cos://us-east/bucket1', marks=pytest.mark.ibm) - ]) - def test_private_bucket(self, private_bucket): - # Attempts to access private buckets not belonging to the user. - # These buckets are known to be private, but may need to be updated if - # they are removed by their owners. 
- store_type = urllib.parse.urlsplit(private_bucket).scheme - if store_type == 'https' or store_type == 'cos': - private_bucket_name = urllib.parse.urlsplit( - private_bucket).path.strip('/') - else: - private_bucket_name = urllib.parse.urlsplit(private_bucket).netloc - with pytest.raises( - sky.exceptions.StorageBucketGetError, - match=storage_lib._BUCKET_FAIL_TO_CONNECT_MESSAGE.format( - name=private_bucket_name)): - storage_obj = storage_lib.Storage(source=private_bucket) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_az_bucket', - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param('tmp_ibm_cos_bucket', - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_upload_to_existing_bucket(self, ext_bucket_fixture, request, - tmp_source, store_type): - # Tries uploading existing files to newly created bucket (outside of - # sky) and verifies that files are written. 
- bucket_name, _ = request.getfixturevalue(ext_bucket_fixture) - storage_obj = storage_lib.Storage(name=bucket_name, source=tmp_source) - storage_obj.add_store(store_type) - - # Check if tmp_source/tmp-file exists in the bucket using aws cli - out = subprocess.check_output(self.cli_ls_cmd(store_type, bucket_name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check symlinks - symlinks don't get copied by sky storage - assert (pathlib.Path(tmp_source) / 'circle-link').is_symlink(), ( - 'circle-link was not found in the upload source - ' - 'are the test fixtures correct?') - assert 'circle-link' not in out.decode('utf-8'), ( - 'Symlink found in bucket - ls output was : {}'.format( - out.decode('utf-8'))) - - # Run sky storage ls to check if storage object exists in the output. - # It should not exist because the bucket was created externally. - out = subprocess.check_output(['sky', 'storage', 'ls']) - assert storage_obj.name not in out.decode('utf-8') - - @pytest.mark.no_fluidstack - def test_copy_mount_existing_storage(self, - tmp_copy_mnt_existing_storage_obj): - # Creates a bucket with no source in MOUNT mode (empty bucket), and - # then tries to load the same storage in COPY mode. - tmp_copy_mnt_existing_storage_obj.add_store(storage_lib.StoreType.S3) - storage_name = tmp_copy_mnt_existing_storage_obj.name - - # Check `sky storage ls` to ensure storage object exists - out = subprocess.check_output(['sky', 'storage', 'ls']).decode('utf-8') - assert storage_name in out, f'Storage {storage_name} not found in sky storage ls.' 
- - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('store_type', [ - storage_lib.StoreType.S3, storage_lib.StoreType.GCS, - pytest.param(storage_lib.StoreType.AZURE, marks=pytest.mark.azure), - pytest.param(storage_lib.StoreType.IBM, marks=pytest.mark.ibm), - pytest.param(storage_lib.StoreType.R2, marks=pytest.mark.cloudflare) - ]) - def test_list_source(self, tmp_local_list_storage_obj, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. - tmp_local_list_storage_obj.add_store(store_type) - - # Check if tmp-file exists in the bucket root using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - # Check if tmp-file exists in the bucket/tmp-source using cli - out = subprocess.check_output(self.cli_ls_cmd( - store_type, tmp_local_list_storage_obj.name, 'tmp-source/'), - shell=True) - assert 'tmp-file' in out.decode('utf-8'), \ - 'File not found in bucket - output was : {}'.format(out.decode - ('utf-8')) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('invalid_name_list, store_type', - [(AWS_INVALID_NAMES, storage_lib.StoreType.S3), - (GCS_INVALID_NAMES, storage_lib.StoreType.GCS), - pytest.param(AZURE_INVALID_NAMES, - storage_lib.StoreType.AZURE, - marks=pytest.mark.azure), - pytest.param(IBM_INVALID_NAMES, - storage_lib.StoreType.IBM, - marks=pytest.mark.ibm), - pytest.param(AWS_INVALID_NAMES, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_invalid_names(self, invalid_name_list, store_type): - # Uses a list in the source field to specify a file and a directory to - # be uploaded to the storage object. 
- for name in invalid_name_list: - with pytest.raises(sky.exceptions.StorageNameError): - storage_obj = storage_lib.Storage(name=name) - storage_obj.add_store(store_type) - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize( - 'gitignore_structure, store_type', - [(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.S3), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.GCS), - (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.AZURE), - pytest.param(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, - store_type, - tmp_gitignore_storage_obj): - # tests if files included in .gitignore and .git/info/exclude are - # excluded from being transferred to Storage - - tmp_gitignore_storage_obj.add_store(store_type) - - upload_file_name = 'included' - # Count the number of files with the given file name - up_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name=upload_file_name) - git_exclude_cmd = self.cli_count_name_in_bucket(store_type, \ - tmp_gitignore_storage_obj.name, file_name='.git') - cnt_num_file_cmd = self.cli_count_file_in_bucket( - store_type, tmp_gitignore_storage_obj.name) - - up_output = subprocess.check_output(up_cmd, shell=True) - git_exclude_output = subprocess.check_output(git_exclude_cmd, - shell=True) - cnt_output = subprocess.check_output(cnt_num_file_cmd, shell=True) - - assert '3' in up_output.decode('utf-8'), \ - 'Files to be included are not completely uploaded.' - # 1 is read as .gitignore is uploaded - assert '1' in git_exclude_output.decode('utf-8'), \ - '.git directory should not be uploaded.' - # 4 files include .gitignore, included.log, included.txt, include_dir/included.log - assert '4' in cnt_output.decode('utf-8'), \ - 'Some items listed in .gitignore and .git/info/exclude are not excluded.' 
- - @pytest.mark.parametrize('ext_bucket_fixture, store_type', - [('tmp_awscli_bucket', storage_lib.StoreType.S3), - ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), - pytest.param('tmp_awscli_bucket_r2', - storage_lib.StoreType.R2, - marks=pytest.mark.cloudflare)]) - def test_externally_created_bucket_mount_without_source( - self, ext_bucket_fixture, request, store_type): - # Non-sky managed buckets(buckets created outside of Skypilot CLI) - # are allowed to be MOUNTed by specifying the URI of the bucket to - # source field only. When it is attempted by specifying the name of - # the bucket only, it should error out. - # - # TODO(doyoung): Add test for IBM COS. Currently, this is blocked - # as rclone used to interact with IBM COS does not support feature to - # create a bucket, and the ibmcloud CLI is not supported in Skypilot. - # Either of the feature is necessary to simulate an external bucket - # creation for IBM COS. - # https://github.com/skypilot-org/skypilot/pull/1966/files#r1253439837 - - ext_bucket_name, ext_bucket_uri = request.getfixturevalue( - ext_bucket_fixture) - # invalid spec - with pytest.raises(sky.exceptions.StorageSpecError) as e: - storage_obj = storage_lib.Storage( - name=ext_bucket_name, mode=storage_lib.StorageMode.MOUNT) - storage_obj.add_store(store_type) - - assert 'Attempted to mount a non-sky managed bucket' in str(e) - - # valid spec - storage_obj = storage_lib.Storage(source=ext_bucket_uri, - mode=storage_lib.StorageMode.MOUNT) - handle = global_user_state.get_handle_from_storage_name( - storage_obj.name) - if handle: - storage_obj.delete() - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'ap-northeast-1', 'ap-northeast-2', 'ap-northeast-3', 'ap-south-1', - 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'eu-north-1', - 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 'us-east-1', - 'us-east-2', 'us-west-1', 'us-west-2' - ]) - def test_aws_regions(self, tmp_local_storage_obj, region): - # This tests 
creation and upload to bucket in all AWS s3 regions - # To test full functionality, use test_managed_jobs_storage above. - store_type = storage_lib.StoreType.S3 - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - expected_output_region = region - if region == 'us-east-1': - expected_output_region = 'None' # us-east-1 is the default region - assert expected_output_region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - @pytest.mark.no_fluidstack - @pytest.mark.parametrize('region', [ - 'northamerica-northeast1', 'northamerica-northeast2', 'us-central1', - 'us-east1', 'us-east4', 'us-east5', 'us-south1', 'us-west1', 'us-west2', - 'us-west3', 'us-west4', 'southamerica-east1', 'southamerica-west1', - 'europe-central2', 'europe-north1', 'europe-southwest1', 'europe-west1', - 'europe-west2', 'europe-west3', 'europe-west4', 'europe-west6', - 'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12', - 'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2', - 'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2', - 'asia-southeast2', 'me-central1', 'me-central2', 'me-west1', - 'australia-southeast1', 'australia-southeast2', 'africa-south1' - ]) - def test_gcs_regions(self, tmp_local_storage_obj, region): - # This tests creation and upload to bucket in all GCS regions - # To test full functionality, use 
test_managed_jobs_storage above. - store_type = storage_lib.StoreType.GCS - tmp_local_storage_obj.add_store(store_type, region=region) - bucket_name = tmp_local_storage_obj.name - - # Confirm that the bucket was created in the correct region - region_cmd = self.cli_region_cmd(store_type, bucket_name=bucket_name) - out = subprocess.check_output(region_cmd, shell=True) - output = out.decode('utf-8') - assert region in out.decode('utf-8'), ( - f'Bucket was not found in region {region} - ' - f'output of {region_cmd} was: {output}') - - # Check if tmp_source/tmp-file exists in the bucket using cli - ls_cmd = self.cli_ls_cmd(store_type, bucket_name) - out = subprocess.check_output(ls_cmd, shell=True) - output = out.decode('utf-8') - assert 'tmp-file' in output, ( - f'tmp-file not found in bucket - output of {ls_cmd} was: {output}') - - -# ---------- Testing YAML Specs ---------- -# Our sky storage requires credentials to check the bucket existance when -# loading a task from the yaml file, so we cannot make it a unit test. -class TestYamlSpecs: - # TODO(zhwu): Add test for `to_yaml_config` for the Storage object. - # We should not use `examples/storage_demo.yaml` here, since it requires - # users to ensure bucket names to not exist and/or be unique. 
- _TEST_YAML_PATHS = [ - 'examples/minimal.yaml', 'examples/managed_job.yaml', - 'examples/using_file_mounts.yaml', 'examples/resnet_app.yaml', - 'examples/multi_hostname.yaml' - ] - - def _is_dict_subset(self, d1, d2): - """Check if d1 is the subset of d2.""" - for k, v in d1.items(): - if k not in d2: - if isinstance(v, list) or isinstance(v, dict): - assert len(v) == 0, (k, v) - else: - assert False, (k, v) - elif isinstance(v, dict): - assert isinstance(d2[k], dict), (k, v, d2) - self._is_dict_subset(v, d2[k]) - elif isinstance(v, str): - if k == 'accelerators': - resources = sky.Resources() - resources._set_accelerators(v, None) - assert resources.accelerators == d2[k], (k, v, d2) - else: - assert v.lower() == d2[k].lower(), (k, v, d2[k]) - else: - assert v == d2[k], (k, v, d2[k]) - - def _check_equivalent(self, yaml_path): - """Check if the yaml is equivalent after load and dump again.""" - origin_task_config = common_utils.read_yaml(yaml_path) - - task = sky.Task.from_yaml(yaml_path) - new_task_config = task.to_yaml_config() - # d1 <= d2 - print(origin_task_config, new_task_config) - self._is_dict_subset(origin_task_config, new_task_config) - - def test_load_dump_yaml_config_equivalent(self): - """Test if the yaml config is equivalent after load and dump again.""" - pathlib.Path('~/datasets').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/tmpfile').expanduser().touch() - pathlib.Path('~/.ssh').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/.ssh/id_rsa.pub').expanduser().touch() - pathlib.Path('~/tmp-workdir').expanduser().mkdir(exist_ok=True) - pathlib.Path('~/Downloads/tpu').expanduser().mkdir(parents=True, - exist_ok=True) - for yaml_path in self._TEST_YAML_PATHS: - self._check_equivalent(yaml_path) - - -# ---------- Testing Multiple Accelerators ---------- -@pytest.mark.no_fluidstack # Fluidstack does not support K80 gpus for now -@pytest.mark.no_paperspace # Paperspace does not support K80 gpus -def test_multiple_accelerators_ordered(): - name 
= smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - timeout=20 * 60, - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_ordered_with_default(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-ordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered_with_default.yaml | grep "Using user-specified accelerators list"', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-unordered', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. 
- ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Fluidstack has low availability for T4 GPUs -@pytest.mark.no_paperspace # Paperspace does not support T4 GPUs -def test_multiple_accelerators_unordered_with_default(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-accelerators-unordered-with-default', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered_with_default.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky status {name} | grep Spot', - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -def test_multiple_resources(): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'multiple-resources', - [ - f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', - f'sky logs {name} 1 --status', # Ensure the job succeeded. - ], - f'sky down -y {name}', - ) - smoke_tests_utils.run_one_test(test) - - -# ---------- Sky Benchmark ---------- -@pytest.mark.no_fluidstack # Requires other clouds to be enabled -@pytest.mark.no_paperspace # Requires other clouds to be enabled -@pytest.mark.no_kubernetes -@pytest.mark.aws # SkyBenchmark requires S3 access -def test_sky_bench(generic_cloud: str): - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'sky-bench', - [ - f'sky bench launch -y -b {name} --cloud {generic_cloud} -i0 tests/test_yamls/minimal.yaml', - 'sleep 120', - f'sky bench show {name} | grep sky-bench-{name} | grep FINISHED', - ], - f'sky bench down {name} -y; sky bench delete {name} -y', - ) - smoke_tests_utils.run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_context_failover(): - """Test if the kubernetes context failover works. 
- - This test requires two kubernetes clusters: - - kind-skypilot: the local cluster with mock labels for 8 H100 GPUs. - - another accessible cluster: with enough CPUs - To start the first cluster, run: - sky local up - # Add mock label for accelerator - kubectl label node --overwrite skypilot-control-plane skypilot.co/accelerator=h100 --context kind-skypilot - # Get the token for the cluster in context kind-skypilot - TOKEN=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.users[0].user.token}\') - # Get the API URL for the cluster in context kind-skypilot - API_URL=$(kubectl config view --minify --context kind-skypilot -o jsonpath=\'{.clusters[0].cluster.server}\') - # Add mock capacity for GPU - curl --header "Content-Type: application/json-patch+json" --header "Authorization: Bearer $TOKEN" --request PATCH --data \'[{"op": "add", "path": "/status/capacity/nvidia.com~1gpu", "value": "8"}]\' "$API_URL/api/v1/nodes/skypilot-control-plane/status" - # Add a new namespace to test the handling of namespaces - kubectl create namespace test-namespace --context kind-skypilot - # Set the namespace to test-namespace - kubectl config set-context kind-skypilot --namespace=test-namespace --context kind-skypilot - """ - # Get context that is not kind-skypilot - contexts = subprocess.check_output('kubectl config get-contexts -o name', - shell=True).decode('utf-8').split('\n') - context = [context for context in contexts if context != 'kind-skypilot'][0] - config = textwrap.dedent(f"""\ - kubernetes: - allowed_contexts: - - kind-skypilot - - {context} - """) - with tempfile.NamedTemporaryFile(delete=True) as f: - f.write(config.encode('utf-8')) - f.flush() - name = smoke_tests_utils.get_cluster_name() - test = smoke_tests_utils.Test( - 'kubernetes-context-failover', - [ - # Check if kind-skypilot is provisioned with H100 annotations already - 'NODE_INFO=$(kubectl get nodes -o yaml --context kind-skypilot) && ' - 'echo "$NODE_INFO" | grep nvidia.com/gpu | 
grep 8 && ' - 'echo "$NODE_INFO" | grep skypilot.co/accelerator | grep h100 || ' - '{ echo "kind-skypilot does not exist ' - 'or does not have mock labels for GPUs. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - # Check namespace for kind-skypilot is test-namespace - 'kubectl get namespaces --context kind-skypilot | grep test-namespace || ' - '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in ' - 'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }', - 'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"', - # Get contexts and set current context to the other cluster that is not kind-skypilot - f'kubectl config use-context {context}', - # H100 should not in the current context - '! sky show-gpus --cloud kubernetes | grep H100', - f'sky launch -y -c {name}-1 --cpus 1 echo hi', - f'sky logs {name}-1 --status', - # It should be launched not on kind-skypilot - f'sky status -a {name}-1 | grep "{context}"', - # Test failure for launching H100 on other cluster - f'sky launch -y -c {name}-2 --gpus H100 --cpus 1 --cloud kubernetes --region {context} echo hi && exit 1 || true', - # Test failover - f'sky launch -y -c {name}-3 --gpus H100 --cpus 1 --cloud kubernetes echo hi', - f'sky logs {name}-3 --status', - # Test pods - f'kubectl get pods --context kind-skypilot | grep "{name}-3"', - # It should be launched on kind-skypilot - f'sky status -a {name}-3 | grep "kind-skypilot"', - # Should be 7 free GPUs - f'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep " 7"', - # Remove the line with "kind-skypilot" - f'sed -i "/kind-skypilot/d" {f.name}', - # Should still be able to exec and launch on existing cluster - f'sky exec {name}-3 "echo hi"', - f'sky logs {name}-3 --status', - f'sky status -r {name}-3 | grep UP', - f'sky launch -c {name}-3 --gpus h100 echo hi', - f'sky logs {name}-3 
--status', - f'sky status -r {name}-3 | grep UP', - ], - f'sky down -y {name}-1 {name}-3', - env={'SKYPILOT_CONFIG': f.name}, - ) - smoke_tests_utils.run_one_test(test) diff --git a/tests/test_yamls/minimal_test_required_before_merge.yaml b/tests/test_yamls/minimal_test_pre_merge.yaml similarity index 60% rename from tests/test_yamls/minimal_test_required_before_merge.yaml rename to tests/test_yamls/minimal_test_pre_merge.yaml index aceb5a76cb0..583575bee5c 100644 --- a/tests/test_yamls/minimal_test_required_before_merge.yaml +++ b/tests/test_yamls/minimal_test_pre_merge.yaml @@ -10,4 +10,4 @@ workdir: . num_nodes: 1 run: | - ls -l ~/aws/tests/test_yamls/minimal_test_required_before_merge.yaml + ls -l ~/aws/tests/test_yamls/minimal_test_pre_merge.yaml From f29637fa2f7fddfff7ada61b35edfa9e0aa289a1 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 11 Dec 2024 18:30:29 +0800 Subject: [PATCH 61/64] fix import --- .buildkite/generate_pipeline.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 8f1389d409a..636923ae37a 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -7,9 +7,9 @@ ├── test_*.py -> release pipeline ├── test_pre_merge.py -> pre-merge pipeline -run `python .buildkite/generate_pipeline.py` to generate the pipeline for -testing. The CI will run this script as a pre-step, and use the generated -pipeline to run the tests. +run `PYTHONPATH=$(pwd)/tests:$PYTHONPATH python .buildkite/generate_pipeline.py` +to generate the pipeline for testing. The CI will run this script as a pre-step, +and use the generated pipeline to run the tests. 1. release pipeline, which runs all smoke tests by default, generates all smoke tests for all clouds. 
@@ -27,14 +27,9 @@ import sys from typing import Any, Dict, List, Optional -import yaml - -# Add project root to Python path -tests_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'tests') -sys.path.append(tests_path) - from conftest import all_clouds_in_smoke_tests from conftest import default_clouds_to_run +import yaml DEFAULT_CLOUDS_TO_RUN = default_clouds_to_run ALL_CLOUDS_IN_SMOKE_TESTS = all_clouds_in_smoke_tests From 010f4afed3a6989d7de8821386928b4acf5dd776 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 11 Dec 2024 18:48:20 +0800 Subject: [PATCH 62/64] fix import --- .buildkite/generate_pipeline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 636923ae37a..62e304ffcda 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -24,15 +24,14 @@ import ast import os import random -import sys from typing import Any, Dict, List, Optional -from conftest import all_clouds_in_smoke_tests +from conftest import cloud_to_pytest_keyword from conftest import default_clouds_to_run import yaml DEFAULT_CLOUDS_TO_RUN = default_clouds_to_run -ALL_CLOUDS_IN_SMOKE_TESTS = all_clouds_in_smoke_tests +PYTEST_TO_CLOUD_KEYWORD = {v: k for k, v in cloud_to_pytest_keyword.items()} QUEUE_GENERIC_CLOUD = 'generic_cloud' QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve' @@ -119,10 +118,11 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]: if suffix == 'serve': is_serve_test = True continue - if suffix not in ALL_CLOUDS_IN_SMOKE_TESTS: + if suffix not in PYTEST_TO_CLOUD_KEYWORD: # This mark does not specify a cloud, so we skip it. 
continue - clouds_to_include.append(suffix) + clouds_to_include.append( + PYTEST_TO_CLOUD_KEYWORD[suffix]) clouds_to_include = (clouds_to_include if clouds_to_include else DEFAULT_CLOUDS_TO_RUN) clouds_to_include = [ From 8d5d023ddbcfeabbfc9b284e6a363607e609d916 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 12 Dec 2024 10:51:59 +0800 Subject: [PATCH 63/64] support gcp on pre merge test --- .buildkite/generate_pipeline.py | 3 +-- tests/smoke_tests/test_pre_merge.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 62e304ffcda..2b0f1cec788 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -222,8 +222,7 @@ def _convert_pre_merge(test_files: List[str]): 'command': 'bash tests/backward_compatibility_tests.sh', 'agents': { 'queue': 'back_compat' - }, - 'if': 'build.env("aws") == "1"' + } }) output_file_pipelines.append(pipeline) print(f'Converted {test_file} to {yaml_file_path}\n\n') diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_pre_merge.py index a2da638b8de..5254b289df1 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_pre_merge.py @@ -2,29 +2,30 @@ # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/smoke_tests/test_required_before_merge.py +# > pytest tests/smoke_tests/test_pre_merge.py # # Terminate failed clusters after test finishes -# > pytest tests/smoke_tests/test_required_before_merge.py --terminate-on-failure +# > pytest tests/smoke_tests/test_pre_merge.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/smoke_tests/test_required_before_merge.py::test_yaml_launch_and_mount +# > pytest tests/smoke_tests/test_pre_merge.py::test_yaml_launch_and_mount # # Only run test for AWS + generic tests -# > pytest 
tests/smoke_tests/test_required_before_merge.py --aws -# -# Change cloud for generic tests to aws -# > pytest tests/smoke_tests/test_required_before_merge.py --generic-cloud aws +# > pytest tests/smoke_tests/test_pre_merge.py --aws +import pytest from smoke_tests import smoke_tests_utils import sky -def test_yaml_launch_and_mount(generic_cloud: str): +@pytest.mark.aws +@pytest.mark.azure +@pytest.mark.gcp +def test_yaml_launch_and_mount(): name = smoke_tests_utils.get_cluster_name() test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', From 0bd7d044cc59692c1b16d1222c0bd17c01e66fce Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 12 Dec 2024 11:37:32 +0800 Subject: [PATCH 64/64] no gcp test case for pre merge --- tests/smoke_tests/test_pre_merge.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_pre_merge.py index 5254b289df1..4890ac15ce4 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_pre_merge.py @@ -15,17 +15,16 @@ # # Only run test for AWS + generic tests # > pytest tests/smoke_tests/test_pre_merge.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_pre_merge.py --generic-cloud aws -import pytest from smoke_tests import smoke_tests_utils import sky -@pytest.mark.aws -@pytest.mark.azure -@pytest.mark.gcp -def test_yaml_launch_and_mount(): +def test_yaml_launch_and_mount(generic_cloud: str): name = smoke_tests_utils.get_cluster_name() test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount',