From 87e0c6c6a8bf22f6d7e4a701107eb06bc58898a9 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 6 Jan 2024 03:31:42 +0000 Subject: [PATCH] Add 'TERMINATED' to ReplicaStatus --- sky/serve/replica_managers.py | 9 +++++++++ sky/serve/serve_state.py | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 8c997b28992..aaa0912a65e 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -285,6 +285,9 @@ def is_scale_down_succeeded(self, initial_delay_seconds: int) -> bool: notify the user that something is wrong with the user code / setup. """ logger.info('is_scale_down_succeeded(1)') + logger.info(f'is_scale_down_succeeded(self.sky_launch_status):{self.sky_launch_status}') + logger.info(f'is_scale_down_succeeded(self.sky_down_status):{self.sky_down_status}') + if self.sky_launch_status != ProcessStatus.SUCCEEDED: logger.info('is_scale_down_succeeded(2)') return False @@ -343,6 +346,11 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: # Pending to launch return serve_state.ReplicaStatus.PENDING if self.sky_launch_status == ProcessStatus.RUNNING: + # replica is terminated, but the ReplicaInfo is kept as some + # failure detected. + if self.sky_down_status is not None: + if self.sky_down_status == ProcessStatus.SUCCEEDED: + return serve_state.ReplicaStatus.TERMINATED # Still launching return serve_state.ReplicaStatus.PROVISIONING if self.sky_down_status is not None: @@ -841,6 +849,7 @@ def _refresh_process_pool(self) -> None: logger.info(f'Termination of replica {replica_id} ' 'finished. Replica info is kept since some ' 'failure detected.') + info.status_property serve_state.add_or_update_replica(self._service_name, replica_id, info) diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index b09dd063c90..f09a2f34716 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -95,6 +95,10 @@ class ReplicaStatus(enum.Enum): # Unknown. 
This should never happen (used only for unexpected errors). UNKNOWN = 'UNKNOWN' + + # The replica successfully terminated, but the 'ReplicaInfo' is kept since + # some failure was detected. + TERMINATED = 'TERMINATED' @classmethod def failed_statuses(cls) -> List['ReplicaStatus']: