Skip to content

Commit

Permalink
better UX for auto restart
Browse files Browse the repository at this point in the history
  • Loading branch information
cblmemo committed Sep 14, 2023
1 parent 755114f commit b547b19
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 5 deletions.
12 changes: 10 additions & 2 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
from sky.utils.cli_utils import status_utils

if typing.TYPE_CHECKING:
from sky import serve as serve_lib
from sky.backends import backend as backend_lib

logger = sky_logging.init_logger(__name__)
Expand Down Expand Up @@ -4154,9 +4155,16 @@ def serve_status(all: bool, service_names: List[str]):
f'Replicas{colorama.Style.RESET_ALL}')
replica_infos = []
for service_record in service_records:
handle: 'serve_lib.ServiceHandle' = service_record['handle']
for replica_record in service_record['replica_info']:
replica_record['service_name'] = service_record['name']
replica_infos.append(replica_record)
# Only print FAILED replicas if:
# 1. --all is specified;
# 2. auto_restart is not enabled (in which case a FAILED replica
# counts as one replica).
if (all or not handle.auto_restart or replica_record['status'] !=
status_lib.ReplicaStatus.FAILED):
replica_record['service_name'] = service_record['name']
replica_infos.append(replica_record)
status_utils.show_replica_table(replica_infos, all)

failed_controllers = [
Expand Down
3 changes: 2 additions & 1 deletion sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,7 +1024,8 @@ def serve_up(
service_name=service_name,
policy=task.service.policy_str(),
requested_resources=requested_resources,
requested_controller_resources=controller_resources)
requested_controller_resources=controller_resources,
auto_restart=task.service.auto_restart)
# Use filelock here to make sure only one process can write to database
# at the same time. Then we generate available controller name again to
# make sure even in race condition, we can still get the correct controller
Expand Down
4 changes: 4 additions & 0 deletions sky/serve/serve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ class ServiceHandle(object):
- (required) Service autoscaling policy description str.
- (required) Service requested resources.
- (required) Service requested controller resources.
- (required) Whether the service has auto restart enabled.
- (optional) Service uptime.
- (optional) Service endpoint IP.
- (optional) Controller port.
Expand All @@ -317,6 +318,7 @@ def __init__(
policy: str,
requested_resources: 'sky.Resources',
requested_controller_resources: 'sky.Resources',
auto_restart: bool,
uptime: Optional[int] = None,
endpoint_ip: Optional[str] = None,
controller_port: Optional[int] = None,
Expand All @@ -331,6 +333,7 @@ def __init__(
self.policy = policy
self.requested_resources = requested_resources
self.requested_controller_resources = requested_controller_resources
self.auto_restart = auto_restart
self.controller_port = controller_port
self.load_balancer_port = load_balancer_port
self.job_id = job_id
Expand All @@ -345,6 +348,7 @@ def __repr__(self):
f'\n\trequested_resources={self.requested_resources},'
'\n\trequested_controller_resources='
f'{self.requested_controller_resources},'
f'\n\tauto_restart={self.auto_restart},'
f'\n\tcontroller_port={self.controller_port},'
f'\n\tload_balancer_port={self.load_balancer_port},'
f'\n\tjob_id={self.job_id},'
Expand Down
8 changes: 6 additions & 2 deletions sky/utils/cli_utils/status_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,11 +421,15 @@ def _get_uptime(service_record: _ServiceRecord) -> str:


def _get_replicas(service_record: _ServiceRecord) -> str:
ready_replica_num = 0
ready_replica_num, total_replica_num = 0, 0
auto_restart = _get_service_handle(service_record).auto_restart
for info in service_record['replica_info']:
if _get_status(info) == status_lib.ReplicaStatus.READY:
ready_replica_num += 1
total_replica_num = len(service_record['replica_info'])
# If auto restart is enabled, do not count FAILED replicas here.
if (not auto_restart or
_get_status(info) != status_lib.ReplicaStatus.FAILED):
total_replica_num += 1
return f'{ready_replica_num}/{total_replica_num}'


Expand Down

0 comments on commit b547b19

Please sign in to comment.