From a2f86dabe8958aab6b14152fbc61ed2fb5ba6582 Mon Sep 17 00:00:00 2001
From: Christopher Cooper
Date: Wed, 30 Oct 2024 15:55:25 -0700
Subject: [PATCH] [test] smoke test fixes for managed jobs (#4217)

* [test] don't wait for old pending jobs controller messages

`sky jobs queue` used to output a temporary "waiting" message while the
managed jobs controller was still being provisioned/starting. Since #3288
this is not shown, and instead the queued jobs themselves will show
PENDING/STARTING. This also requires some changes to tests to permit the
PENDING and STARTING states for managed jobs.

* fix default aws region

* [test] wait for RECOVERING more quickly

Smoke tests were failing because some managed jobs were fully recovering
back to the RUNNING state before the smoke test could catch the RECOVERING
case (see e.g. #4192 `test_managed_jobs_cancellation_gcp`). Change tests
that manually terminate a managed job instance so that they wait for the
managed job to leave the RUNNING state, checking every 10s.

* address PR comments

* fix
---
 tests/conftest.py   |   2 +-
 tests/test_smoke.py | 221 +++++++++++++++++++++-----------------------
 2 files changed, 108 insertions(+), 115 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index bb79abfe61e..aa0d0c88289 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -206,7 +206,7 @@ def enable_all_clouds(monkeypatch: pytest.MonkeyPatch) -> None:
 @pytest.fixture
 def aws_config_region(monkeypatch: pytest.MonkeyPatch) -> str:
     from sky import skypilot_config
-    region = 'us-west-2'
+    region = 'us-east-2'
     if skypilot_config.loaded():
         ssh_proxy_command = skypilot_config.get_nested(
             ('aws', 'ssh_proxy_command'), None)
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 2dcb39ce1c9..cfe7652e693 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -85,24 +85,15 @@
     'touch ~/.ssh/id_rsa.pub'
 ]
 
-# Wait until the jobs controller is not in INIT state.
-# This is a workaround for the issue that when multiple job tests
-# are running in parallel, the jobs controller may be in INIT and
-# the job queue/cancel command will return staled table.
-_JOB_QUEUE_WAIT = ('s=$(sky jobs queue); '
-                   'until ! echo "$s" | grep "jobs will not be shown until"; '
-                   'do echo "Waiting for job queue to be ready..."; '
-                   'sleep 5; s=$(sky jobs queue); done; echo "$s"; '
-                   'echo; echo; echo "$s"')
-_JOB_CANCEL_WAIT = (
-    's=$(sky jobs cancel -y -n {job_name}); '
-    'until ! echo "$s" | grep "Please wait for the controller to be ready."; '
-    'do echo "Waiting for the jobs controller '
-    'to be ready"; sleep 5; s=$(sky jobs cancel -y -n {job_name}); '
-    'done; echo "$s"; echo; echo; echo "$s"')
-# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues
-# when the controller being on Azure, which takes a long time for launching
-# step.
+# Get the job queue, and print it once on its own, then print it again to
+# use with grep by the caller.
+_GET_JOB_QUEUE = 's=$(sky jobs queue); echo "$s"; echo "$s"'
+# Wait for a job to be not in RUNNING state. Used to check for RECOVERING.
+_JOB_WAIT_NOT_RUNNING = (
+    's=$(sky jobs queue);'
+    'until ! echo "$s" | grep "{job_name}" | grep "RUNNING"; do '
+    'sleep 10; s=$(sky jobs queue);'
+    'echo "Waiting for job to stop RUNNING"; echo "$s"; done')
 
 DEFAULT_CMD_TIMEOUT = 15 * 60
 
@@ -2643,6 +2634,9 @@ def test_stop_gcp_spot():
 
 
 # ---------- Testing managed job ----------
+# TODO(zhwu): make the jobs controller on GCP, to avoid parallel test issues
+# when the controller being on Azure, which takes a long time for launching
+# step.
 @pytest.mark.managed_jobs
 def test_managed_jobs(generic_cloud: str):
     """Test the managed jobs yaml."""
@@ -2653,22 +2647,21 @@ def test_managed_jobs(generic_cloud: str):
             f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d',
             f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d',
             'sleep 5',
-            f'{_JOB_QUEUE_WAIT}| grep {name}-1 | head -n1 | grep "STARTING\|RUNNING"',
-            f'{_JOB_QUEUE_WAIT}| grep {name}-2 | head -n1 | grep "STARTING\|RUNNING"',
-            _JOB_CANCEL_WAIT.format(job_name=f'{name}-1'),
+            f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"',
+            f'sky jobs cancel -y -n {name}-1',
             'sleep 5',
-            f'{_JOB_QUEUE_WAIT}| grep {name}-1 | head -n1 | grep "CANCELLING\|CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "CANCELLING\|CANCELLED"',
             'sleep 200',
-            f'{_JOB_QUEUE_WAIT}| grep {name}-1 | head -n1 | grep CANCELLED',
+            f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep CANCELLED',
             # Test the functionality for logging.
             f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"',
             f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"',
-            f'{_JOB_QUEUE_WAIT}| grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"',
+            f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"',
         ],
-        # TODO(zhwu): Change to _JOB_CANCEL_WAIT.format(job_name=f'{name}-1 -n {name}-2') when
+        # TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when
         # canceling multiple job names is supported.
-        (_JOB_CANCEL_WAIT.format(job_name=f'{name}-1') + '; ' +
-         _JOB_CANCEL_WAIT.format(job_name=f'{name}-2')),
+        f'sky jobs cancel -y -n {name}-1; sky jobs cancel -y -n {name}-2',
         # Increase timeout since sky jobs queue -r can be blocked by other spot tests.
         timeout=20 * 60,
     )
@@ -2690,26 +2683,26 @@ def test_job_pipeline(generic_cloud: str):
         [
             f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml -y -d',
             'sleep 5',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "STARTING\|RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"',
             # `grep -A 4 {name}` finds the job with {name} and the 4 lines
             # after it, i.e. the 4 tasks within the job.
             # `sed -n 2p` gets the second line of the 4 lines, i.e. the first
             # task within the job.
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 3p | grep "PENDING"',
-            _JOB_CANCEL_WAIT.format(job_name=f'{name}'),
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"',
+            f'sky jobs cancel -y -n {name}',
             'sleep 5',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"',
             'sleep 200',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 2p | grep "CANCELLED"',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 3p | grep "CANCELLED"',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 4p | grep "CANCELLED"',
-            f'{_JOB_QUEUE_WAIT}| grep -A 4 {name}| sed -n 5p | grep "CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=f'{name}'),
+        f'sky jobs cancel -y -n {name}',
         # Increase timeout since sky jobs queue -r can be blocked by other spot tests.
         timeout=30 * 60,
     )
@@ -2732,9 +2725,9 @@ def test_managed_jobs_failed_setup(generic_cloud: str):
             f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml',
             'sleep 330',
             # Make sure the job failed quickly.
-            f'{_JOB_QUEUE_WAIT} | grep {name} | head -n1 | grep "FAILED_SETUP"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=name),
+        f'sky jobs cancel -y -n {name}',
         # Increase timeout since sky jobs queue -r can be blocked by other spot tests.
         timeout=20 * 60,
     )
@@ -2757,17 +2750,17 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str):
             f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml',
             'sleep 600',
             # Make sure the job failed quickly.
-            f'{_JOB_QUEUE_WAIT} | grep {name} | head -n1 | grep "FAILED_SETUP"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"',
             # Task 0 should be SUCCEEDED.
-            f'{_JOB_QUEUE_WAIT} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "SUCCEEDED"',
             # Task 1 should be FAILED_SETUP.
-            f'{_JOB_QUEUE_WAIT} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "FAILED_SETUP"',
             # Task 2 should be CANCELLED.
-            f'{_JOB_QUEUE_WAIT} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLED"',
             # Task 3 should be CANCELLED.
-            f'{_JOB_QUEUE_WAIT} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"',
+            f'{_GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLED"',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=name),
+        f'sky jobs cancel -y -n {name}',
         # Increase timeout since sky jobs queue -r can be blocked by other spot tests.
         timeout=30 * 60,
     )
@@ -2790,7 +2783,7 @@ def test_managed_jobs_recovery_aws(aws_config_region):
         [
             f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
             'sleep 360',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id',
             # Terminate the cluster manually.
             (f'aws ec2 terminate-instances --region {region} --instance-ids $('
              f'aws ec2 describe-instances --region {region} '
              f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}* '
              f'--query Reservations[].Instances[].InstanceId '
              '--output text)'),
-            'sleep 100',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"',
+            _JOB_WAIT_NOT_RUNNING.format(job_name=name),
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"',
             'sleep 200',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=name),
+        f'sky jobs cancel -y -n {name}',
         timeout=25 * 60,
     )
     run_one_test(test)
@@ -2830,17 +2823,17 @@ def test_managed_jobs_recovery_gcp():
         [
             f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
             'sleep 360',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id',
             # Terminate the cluster manually.
             terminate_cmd,
-            'sleep 60',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"',
+            _JOB_WAIT_NOT_RUNNING.format(job_name=name),
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"',
             'sleep 200',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=name),
+        f'sky jobs cancel -y -n {name}',
         timeout=25 * 60,
     )
     run_one_test(test)
@@ -2861,7 +2854,7 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region):
         [
             f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d',
             'sleep 400',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id',
             f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids',
             # Terminate the cluster manually.
@@ -2878,16 +2871,16 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region):
              f'-{user_hash} '
              f'--query Reservations[].Instances[].InstanceId '
              '--output text)'),
-            'sleep 100',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"',
+            _JOB_WAIT_NOT_RUNNING.format(job_name=name),
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"',
             'sleep 200',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"',
             f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new',
             f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new',
             f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=name),
+        f'sky jobs cancel -y -n {name}',
         timeout=25 * 60,
     )
     run_one_test(test)
@@ -2912,7 +2905,7 @@ def test_managed_jobs_pipeline_recovery_gcp():
         [
             f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d',
             'sleep 400',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id',
             f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids',
             # Terminate the cluster manually.
@@ -2921,16 +2914,16 @@ def test_managed_jobs_pipeline_recovery_gcp():
             # separated by `-`.
             (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | '
              f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'),
-            'sleep 60',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"',
+            _JOB_WAIT_NOT_RUNNING.format(job_name=name),
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"',
             'sleep 200',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"',
             f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new',
             f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new',
             f'cat /tmp/{name}-run-ids | sed -n 2p | grep `cat /tmp/{name}-run-id`',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=name),
+        f'sky jobs cancel -y -n {name}',
         timeout=25 * 60,
     )
     run_one_test(test)
@@ -2951,9 +2944,9 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str):
         [
             f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d',
             'sleep 360',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING\|RECOVERING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|RECOVERING"',
         ],
-        _JOB_CANCEL_WAIT.format(job_name=name),
+        f'sky jobs cancel -y -n {name}',
         timeout=25 * 60,
     )
     run_one_test(test)
@@ -2972,7 +2965,7 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region):
         [
             f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
             'sleep 450',
-            f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
+            f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep 
"RUNNING"', f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -2981,13 +2974,13 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'Name=tag:ray-node-type,Values=worker ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - 'sleep 50', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"', + _JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', 'sleep 560', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], - _JOB_CANCEL_WAIT.format(job_name=name), + f'sky jobs cancel -y -n {name}', timeout=30 * 60, ) run_one_test(test) @@ -3013,17 +3006,17 @@ def test_managed_jobs_recovery_multi_node_gcp(): [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', 'sleep 400', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. terminate_cmd, - 'sleep 50', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"', + _JOB_WAIT_NOT_RUNNING.format(job_name=name), + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', 'sleep 420', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], - _JOB_CANCEL_WAIT.format(job_name=name), + f'sky jobs cancel -y -n {name}', timeout=25 * 60, ) run_one_test(test) @@ -3046,12 +3039,12 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', 'sleep 60', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "STARTING"', - _JOB_CANCEL_WAIT.format(job_name=name), + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING"', + f'sky jobs cancel -y -n {name}', 'sleep 5', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', 'sleep 120', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3060,11 +3053,11 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancelling the spot cluster during spot job being setup. 
f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', 'sleep 300', - _JOB_CANCEL_WAIT.format(job_name=f'{name}-2'), + f'sky jobs cancel -y -n {name}-2', 'sleep 5', - f'{_JOB_QUEUE_WAIT}| grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', 'sleep 120', - f'{_JOB_QUEUE_WAIT}| grep {name}-2 | head -n1 | grep "CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3073,20 +3066,20 @@ def test_managed_jobs_cancellation_aws(aws_config_region): # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', 'sleep 300', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "RUNNING"', + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_3_on_cloud}-* ' f'--query Reservations[].Instances[].InstanceId ' '--output text)'), - 'sleep 120', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "RECOVERING"', - _JOB_CANCEL_WAIT.format(job_name=f'{name}-3'), + _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + f'sky jobs cancel -y -n {name}-3', 'sleep 5', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', 'sleep 120', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$(aws ec2 describe-instances --region {region} ' @@ -3122,33 +3115,33 @@ def test_managed_jobs_cancellation_gcp(): # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', 'sleep 60', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "STARTING"', - _JOB_CANCEL_WAIT.format(job_name=name), + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING"', + f'sky jobs cancel -y -n {name}', 'sleep 5', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', 'sleep 120', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', # Test cancelling the spot cluster during spot job being setup. 
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', 'sleep 300', - _JOB_CANCEL_WAIT.format(job_name=f'{name}-2'), + f'sky jobs cancel -y -n {name}-2', 'sleep 5', - f'{_JOB_QUEUE_WAIT}| grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', 'sleep 120', - f'{_JOB_QUEUE_WAIT}| grep {name}-2 | head -n1 | grep "CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', 'sleep 300', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "RUNNING"', + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', # Terminate the cluster manually. terminate_cmd, - 'sleep 80', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "RECOVERING"', - _JOB_CANCEL_WAIT.format(job_name=f'{name}-3'), + _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', + f'sky jobs cancel -y -n {name}-3', 'sleep 5', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', 'sleep 120', - f'{_JOB_QUEUE_WAIT}| grep {name}-3 | head -n1 | grep "CANCELLED"', + f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' @@ -3239,12 +3232,12 @@ def test_managed_jobs_storage(generic_cloud: str): f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region 'sleep 60', # Wait the spot queue to be updated - f'{_JOB_QUEUE_WAIT}| grep {name} | grep SUCCEEDED', + f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd ], - (_JOB_CANCEL_WAIT.format(job_name=name), + (f'sky jobs cancel -y -n {name}', f'; sky storage delete {output_storage_name} || true'), # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, @@ -3264,11 +3257,11 @@ def test_managed_jobs_tpu(): [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', 'sleep 5', - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep STARTING', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep STARTING', 'sleep 900', # TPU takes a while to launch - f'{_JOB_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING\|SUCCEEDED"', + f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|SUCCEEDED"', ], - _JOB_CANCEL_WAIT.format(job_name=name), + f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) @@ -3285,9 +3278,9 @@ def test_managed_jobs_inline_env(generic_cloud: str): [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! 
-z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', 'sleep 20', - f'{_JOB_QUEUE_WAIT} | grep {name} | grep SUCCEEDED', + f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', ], - _JOB_CANCEL_WAIT.format(job_name=name), + f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. timeout=20 * 60, ) @@ -5613,7 +5606,7 @@ def test_sky_bench(generic_cloud: str): @pytest.mark.kubernetes def test_kubernetes_context_failover(): """Test if the kubernetes context failover works. - + This test requires two kubernetes clusters: - kind-skypilot: the local cluster with mock labels for 8 H100 GPUs. - another accessible cluster: with enough CPUs