diff --git a/sky/cli.py b/sky/cli.py index d8b9b23d74d..287a16b995e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -77,6 +77,7 @@ from sky.utils.cli_utils import status_utils if typing.TYPE_CHECKING: + from sky import serve as serve_lib from sky.backends import backend as backend_lib logger = sky_logging.init_logger(__name__) @@ -4154,9 +4155,16 @@ def serve_status(all: bool, service_names: List[str]): f'Replicas{colorama.Style.RESET_ALL}') replica_infos = [] for service_record in service_records: + handle: 'serve_lib.ServiceHandle' = service_record['handle'] for replica_record in service_record['replica_info']: - replica_record['service_name'] = service_record['name'] - replica_infos.append(replica_record) + # Only print FAILED replicas if: + # 1. --all is specified; + # 2. auto_restart is not enabled (in which case a FAILED replica + # counts as one replica). + if (all or not handle.auto_restart or replica_record['status'] != + status_lib.ReplicaStatus.FAILED): + replica_record['service_name'] = service_record['name'] + replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) failed_controllers = [ diff --git a/sky/execution.py b/sky/execution.py index c81d74e6373..3673e77964e 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1024,7 +1024,8 @@ def serve_up( service_name=service_name, policy=task.service.policy_str(), requested_resources=requested_resources, - requested_controller_resources=controller_resources) + requested_controller_resources=controller_resources, + auto_restart=task.service.auto_restart) # Use filelock here to make sure only one process can write to database # at the same time.
Then we generate available controller name again to # make sure even in race condition, we can still get the correct controller diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 8f05b81c364..1058ffb7f34 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -299,6 +299,7 @@ class ServiceHandle(object): - (required) Service autoscaling policy description str. - (required) Service requested resources. - (required) Service requested controller resources. + - (required) Whether the service has auto restart enabled. - (optional) Service uptime. - (optional) Service endpoint IP. - (optional) Controller port. @@ -317,6 +318,7 @@ def __init__( policy: str, requested_resources: 'sky.Resources', requested_controller_resources: 'sky.Resources', + auto_restart: bool, uptime: Optional[int] = None, endpoint_ip: Optional[str] = None, controller_port: Optional[int] = None, @@ -331,6 +333,7 @@ def __init__( self.policy = policy self.requested_resources = requested_resources self.requested_controller_resources = requested_controller_resources + self.auto_restart = auto_restart self.controller_port = controller_port self.load_balancer_port = load_balancer_port self.job_id = job_id @@ -345,6 +348,7 @@ def __repr__(self): f'\n\trequested_resources={self.requested_resources},' '\n\trequested_controller_resources=' f'{self.requested_controller_resources},' + f'\n\tauto_restart={self.auto_restart},' f'\n\tcontroller_port={self.controller_port},' f'\n\tload_balancer_port={self.load_balancer_port},' f'\n\tjob_id={self.job_id},' diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 8df9f28b5e3..5b78609850a 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -421,11 +421,15 @@ def _get_uptime(service_record: _ServiceRecord) -> str: def _get_replicas(service_record: _ServiceRecord) -> str: - ready_replica_num = 0 + ready_replica_num, total_replica_num = 0, 0 + auto_restart = 
_get_service_handle(service_record).auto_restart for info in service_record['replica_info']: if _get_status(info) == status_lib.ReplicaStatus.READY: ready_replica_num += 1 - total_replica_num = len(service_record['replica_info']) + # If auto restart is enabled, do not count FAILED replicas here. + if (not auto_restart or + _get_status(info) != status_lib.ReplicaStatus.FAILED): + total_replica_num += 1 return f'{ready_replica_num}/{total_replica_num}'