From ec8ff02d5372984999e2f6d795d24e60c1abe340 Mon Sep 17 00:00:00 2001
From: Manav Avlani
Date: Tue, 18 Jun 2024 10:25:40 -0700
Subject: [PATCH] Add a new experimental restart policy for large scale model
 training (#922)

Summary:
Pull Request resolved: https://github.com/pytorch/torchx/pull/922

TSIA

Reviewed By: andywag

Differential Revision: D58684341
---
 torchx/specs/api.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/torchx/specs/api.py b/torchx/specs/api.py
index 58dd3e9e..ba084ffe 100644
--- a/torchx/specs/api.py
+++ b/torchx/specs/api.py
@@ -237,11 +237,15 @@ class RetryPolicy(str, Enum):
        application to deal with failed replica departures and replacement
        replica admittance.
     2. APPLICATION: Restarts the entire application.
-
+    3. HOT_SPARE: Restarts the replicas for a role as long as quorum (min_replicas)
+       is not violated using extra hosts as spares. It does not really support
+       elasticity and just uses the delta between num_replicas and min_replicas
+       as spares (EXPERIMENTAL).
     """

     REPLICA = "REPLICA"
     APPLICATION = "APPLICATION"
+    HOT_SPARE = "HOT_SPARE"


 class MountType(str, Enum):
@@ -340,6 +344,8 @@ class Role:
             and num_replicas depending on the cluster resources and
             policies. If the scheduler doesn't support auto scaling this
             field is ignored and the job size will be num_replicas.
+            EXPERIMENTAL: For HOT_SPARE restart policy this field is used to
+            indicate the quorum required for the job to run.
         max_retries: max number of retries before giving up
         retry_policy: retry behavior upon replica failures
         resource: Resource requirement for the role. The role should be scheduled
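
A minimal usage sketch for reviewers: only the num_replicas/min_replicas/
retry_policy wiring below reflects the semantics documented in this diff; the
role name, image, entrypoint, args, and resource shape are hypothetical.

    from torchx import specs

    trainer = specs.Role(
        name="trainer",                    # hypothetical role name
        image="example.com/train:latest",  # hypothetical image
        entrypoint="python",
        args=["-m", "my_project.train"],   # hypothetical training module
        num_replicas=20,                   # total hosts requested
        min_replicas=16,                   # quorum: job runs with >= 16 healthy replicas
        max_retries=3,
        retry_policy=specs.RetryPolicy.HOT_SPARE,  # EXPERIMENTAL, added in this patch
        resource=specs.Resource(cpu=8, gpu=8, memMB=65536),
    )

    app = specs.AppDef(name="large_scale_training", roles=[trainer])

With this shape, the 4 extra hosts (num_replicas - min_replicas) act purely as
hot spares: failed replicas are restarted as long as the quorum of 16 is not
violated, with no elastic resizing of the job.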