From ec8ff02d5372984999e2f6d795d24e60c1abe340 Mon Sep 17 00:00:00 2001
From: Manav Avlani
Date: Tue, 18 Jun 2024 10:25:40 -0700
Subject: [PATCH] Add a new experimental restart policy for large scale model
 training (#922)

Summary:
Pull Request resolved: https://github.com/pytorch/torchx/pull/922

TSIA

Reviewed By: andywag

Differential Revision: D58684341
---
 torchx/specs/api.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/torchx/specs/api.py b/torchx/specs/api.py
index 58dd3e9e..ba084ffe 100644
--- a/torchx/specs/api.py
+++ b/torchx/specs/api.py
@@ -237,11 +237,15 @@ class RetryPolicy(str, Enum):
        application to deal with failed replica departures and replacement
        replica admittance.
     2. APPLICATION: Restarts the entire application.
-
+    3. HOT_SPARE: Restarts the replicas for a role as long as quorum (min_replicas)
+       is not violated using extra hosts as spares. It does not really support
+       elasticity and just uses the delta between num_replicas and min_replicas
+       as spares (EXPERIMENTAL).
     """

     REPLICA = "REPLICA"
     APPLICATION = "APPLICATION"
+    HOT_SPARE = "HOT_SPARE"


 class MountType(str, Enum):
@@ -340,6 +344,8 @@ class Role:
             and num_replicas depending on the cluster resources and
             policies. If the scheduler doesn't support auto scaling this
             field is ignored and the job size will be num_replicas.
+            EXPERIMENTAL: For HOT_SPARE restart policy this field is used to
+            indicate the quorum required for the job to run.
         max_retries: max number of retries before giving up
         retry_policy: retry behavior upon replica failures
         resource: Resource requirement for the role. The role should be scheduled
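
A minimal usage sketch for reviewers: only the num_replicas/min_replicas/
retry_policy wiring below reflects the semantics documented in this diff; the
role name, image, entrypoint, args, and resource shape are hypothetical.

    from torchx import specs

    trainer = specs.Role(
        name="trainer",                    # hypothetical role name
        image="example.com/train:latest",  # hypothetical image
        entrypoint="python",
        args=["-m", "my_project.train"],   # hypothetical training module
        num_replicas=20,                   # total hosts requested
        min_replicas=16,                   # quorum: job runs with >= 16 healthy replicas
        max_retries=3,
        retry_policy=specs.RetryPolicy.HOT_SPARE,  # EXPERIMENTAL, added in this patch
        resource=specs.Resource(cpu=8, gpu=8, memMB=65536),
    )

    app = specs.AppDef(name="large_scale_training", roles=[trainer])

With this shape, the 4 extra hosts (num_replicas - min_replicas) act purely as
hot spares: failed replicas are restarted as long as the quorum of 16 is not
violated, with no elastic resizing of the job.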