Skip to content

Commit

Permalink
ux
Browse files (browse the repository at this point in the history)
  • Loading branch information
cblmemo committed Sep 12, 2023
1 parent 9a2d27e commit f4bdbdb
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
14 changes: 8 additions & 6 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2682,27 +2682,29 @@ def _refresh_service_record_no_lock(
if cluster_record is None:
global_user_state.set_service_status(
service_name, status_lib.ServiceStatus.CONTROLLER_FAILED)
return record, (f'Controller cluster {controller_name!r} '
'is not found.')
return record, None

handle = cluster_record['handle']
backend = get_backend_from_handle(handle)
assert isinstance(backend, backends.CloudVmRayBackend)

if service_handle.controller_port is None:
return record, 'Controller task is not successfully launched.'
global_user_state.set_service_status(
service_name, status_lib.ServiceStatus.CONTROLLER_FAILED)
return record, None

code = serve_lib.ServeCodeGen.get_latest_info(
service_handle.controller_port)
returncode, latest_info_payload, stderr = backend.run_on_head(
returncode, latest_info_payload, _ = backend.run_on_head(
handle,
code,
require_outputs=True,
stream_logs=False,
separate_stderr=True)
if returncode != 0:
return record, ('Failed to refresh replica info from the controller. '
f'Using the cached record. Reason: {stderr}')
global_user_state.set_service_status(
service_name, status_lib.ServiceStatus.CONTROLLER_FAILED)
return record, None

latest_info = serve_lib.load_latest_info(latest_info_payload)
service_handle.uptime = latest_info['uptime']
Expand Down
11 changes: 11 additions & 0 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4204,6 +4204,17 @@ def serve_status(all: bool, service_names: List[str]):
replica_infos.append(replica_record)
status_utils.show_replica_table(replica_infos, all)

failed_controllers = [
record['name']
for record in service_records
if record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED
]
if failed_controllers:
num_failed = len(failed_controllers)
plural = '' if num_failed == 1 else 's'
click.echo(f'\n* {num_failed} service{plural} with failed controller '
'found. The replica info and number might not be accurate.')


@serve.command('down', cls=_DocumentedCodeCommand)
@click.argument('service_names',
Expand Down

0 comments on commit f4bdbdb

Please sign in to comment.