diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 7dde6160068..6e19c490409 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -3287,46 +3287,27 @@ def test_managed_jobs_recovery_multi_node_gcp():
 def test_managed_jobs_retry_logs():
     """Test managed job retry logs are properly displayed when a task fails."""
     name = _get_cluster_name()
-    # Create a temporary YAML file with two tasks - first one fails, second succeeds
-    with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml') as f:
-        yaml_content = textwrap.dedent("""
-            resources:
-              cpus: 2+
-              job_recovery:
-                max_restarts_on_errors: 1
-
-            # Task 1: Always fails
-            run: |
-              echo "Task 1 starting"
-              exit 1
-            ---
-            # Task 2: Never reached due to Task 1 failure
-            run: |
-              echo "Task 2 starting"
-              exit 0
-            """)
-        f.write(yaml_content)
-        f.flush()
+    yaml_path = 'tests/test_yamls/test_managed_jobs_retry.yaml'
 
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.log') as log_file:
-            test = Test(
-                'managed_jobs_retry_logs',
-                [
-                    f'sky jobs launch -n {name} {f.name} -y -d',
-                    f'sky jobs logs -n {name} | tee {log_file.name}',
-                    # First attempt
-                    f'cat {log_file.name} | grep "Job started. Streaming logs..."',
-                    f'cat {log_file.name} | grep "Job 1 failed"',
-                    # Second attempt
-                    f'cat {log_file.name} | grep "Job started. Streaming logs..." | wc -l | grep 2',
-                    f'cat {log_file.name} | grep "Job 1 failed" | wc -l | grep 2',
-                    # Task 2 is not reached
-                    f'! cat {log_file.name} | grep "Job 2"',
-                ],
-                f'sky jobs cancel -y -n {name}',
-                timeout=7 * 60,  # 5 mins
-            )
-            run_one_test(test)
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.log') as log_file:
+        test = Test(
+            'managed_jobs_retry_logs',
+            [
+                f'sky jobs launch -n {name} {yaml_path} -y -d',
+                f'sky jobs logs -n {name} | tee {log_file.name}',
+                # First attempt
+                f'cat {log_file.name} | grep "Job started. Streaming logs..."',
+                f'cat {log_file.name} | grep "Job 1 failed"',
+                # Second attempt
+                f'cat {log_file.name} | grep "Job started. Streaming logs..." | wc -l | grep 2',
+                f'cat {log_file.name} | grep "Job 1 failed" | wc -l | grep 2',
+                # Task 2 is not reached
+                f'! cat {log_file.name} | grep "Job 2"',
+            ],
+            f'sky jobs cancel -y -n {name}',
+            timeout=7 * 60,  # 7 mins
+        )
+        run_one_test(test)
 
 
 @pytest.mark.aws
diff --git a/tests/test_yamls/test_managed_jobs_retry.yaml b/tests/test_yamls/test_managed_jobs_retry.yaml
new file mode 100644
index 00000000000..76289986386
--- /dev/null
+++ b/tests/test_yamls/test_managed_jobs_retry.yaml
@@ -0,0 +1,14 @@
+resources:
+  cpus: 2+
+  job_recovery:
+    max_restarts_on_errors: 1
+
+# Task 1: Always fails
+run: |
+  echo "Task 1 starting"
+  exit 1
+---
+# Task 2: Never reached due to Task 1 failure
+run: |
+  echo "Task 2 starting"
+  exit 0
\ No newline at end of file