From d5118fd24ee3912e2d4170d440ab9676fd08b89a Mon Sep 17 00:00:00 2001
From: Pete Walsh
Date: Mon, 7 Oct 2024 10:06:39 -0700
Subject: [PATCH] Add `retry` field to `ExperimentSpec` (#290)

* Add `retry` field to `TaskSpec`

* Fix

* fixes

* let jobs run elsewhere

* cancel in progress jobs

* fix
---
 .github/workflows/main.yml            |  2 +-
 CHANGELOG.md                          |  4 ++++
 beaker/data_model/experiment_spec.py  | 26 ++++++++++++++++++++++++++
 integration_tests/experiments_test.py |  3 +--
 integration_tests/jobs_test.py        |  3 +--
 5 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2c37423..9f7692e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,7 +2,7 @@ name: Main
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
-  # cancel-in-progress: true
+  cancel-in-progress: true
 
 on:
   pull_request:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 74280c2..9089414 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@ use patch releases for compatibility fixes instead.
 
 ## Unreleased
 
+### Added
+
+- Added `retry` field to `ExperimentSpec`.
+
 ## [v1.31.3](https://github.com/allenai/beaker-py/releases/tag/v1.31.3) - 2024-08-30
 
 ### Added
diff --git a/beaker/data_model/experiment_spec.py b/beaker/data_model/experiment_spec.py
index 243b54d..c1a2a56 100644
--- a/beaker/data_model/experiment_spec.py
+++ b/beaker/data_model/experiment_spec.py
@@ -18,6 +18,7 @@
     "TaskContext",
     "TaskSpec",
     "SpecVersion",
+    "RetrySpec",
     "ExperimentSpec",
     "Constraints",
 ]
@@ -705,6 +706,18 @@ class SpecVersion(StrEnum):
     v2_alpha = "v2-alpha"
 
 
+class RetrySpec(BaseModel, frozen=False):
+    """
+    Defines the retry behavior of an experiment.
+    """
+
+    allowed_task_retries: Optional[int] = None
+    """
+    A positive integer specifying the maximum number of task retries allowed for the experiment,
+    with a max limit of 10.
+    """
+
+
 class ExperimentSpec(BaseModel, frozen=False):
     """
     Experiments are the main unit of execution in Beaker.
@@ -749,6 +762,11 @@ class ExperimentSpec(BaseModel, frozen=False):
     Long-form explanation for an experiment.
     """
 
+    retry: Optional[RetrySpec] = None
+    """
+    Defines the retry behavior of an experiment.
+    """
+
     @field_validator("tasks")
     def _validate_tasks(cls, v: List[TaskSpec]) -> List[TaskSpec]:
         task_names = set()
@@ -882,6 +900,14 @@ def with_description(self, description: str) -> "ExperimentSpec":
         """
         return self.model_copy(deep=True, update={"description": description})
 
+    def with_retries(self, allowed_task_retries: int) -> "ExperimentSpec":
+        """
+        Return a new :class:`ExperimentSpec` with the given number of retries.
+        """
+        return self.model_copy(
+            deep=True, update={"retry": RetrySpec(allowed_task_retries=allowed_task_retries)}
+        )
+
     def validate(self):
         for task in self.tasks:
             if (task.image.beaker is None) == (task.image.docker is None):
diff --git a/integration_tests/experiments_test.py b/integration_tests/experiments_test.py
index 3b8bf9a..21f6e66 100644
--- a/integration_tests/experiments_test.py
+++ b/integration_tests/experiments_test.py
@@ -16,7 +16,6 @@ def test_experiment_workflow(
     client: Beaker,
     experiment_name: str,
     alternate_experiment_name: str,
-    beaker_cluster_name: str,
     hello_world_experiment_name: str,
 ):
     spec = ExperimentSpec(
@@ -25,7 +24,7 @@ def test_experiment_workflow(
             TaskSpec(
                 name="main",
                 image=ImageSource(docker="hello-world"),
-                context=TaskContext(cluster=beaker_cluster_name),
+                context=TaskContext(preemptible=True),
                 result=ResultSpec(path="/unused"),  # required even if the task produces no output.
             ),
         ],
diff --git a/integration_tests/jobs_test.py b/integration_tests/jobs_test.py
index 9570dd7..e929f17 100644
--- a/integration_tests/jobs_test.py
+++ b/integration_tests/jobs_test.py
@@ -3,12 +3,11 @@
 from beaker import Beaker, ExperimentSpec, TaskSpec
 
 
-def test_job_stop_and_finalize(client: Beaker, experiment_name: str, beaker_cluster_name: str):
+def test_job_stop_and_finalize(client: Beaker, experiment_name: str):
     start = time.time()
     spec = ExperimentSpec(budget="ai2/allennlp").with_task(
         TaskSpec.new(
             "main",
-            beaker_cluster_name,
             docker_image="hello-world",
         ),
     )