diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b1b1e470191..37511eeb155 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2682,27 +2682,29 @@ def _refresh_service_record_no_lock( if cluster_record is None: global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - return record, (f'Controller cluster {controller_name!r} ' - 'is not found.') + return record, None handle = cluster_record['handle'] backend = get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) if service_handle.controller_port is None: - return record, 'Controller task is not successfully launched.' + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) + return record, None code = serve_lib.ServeCodeGen.get_latest_info( service_handle.controller_port) - returncode, latest_info_payload, stderr = backend.run_on_head( + returncode, latest_info_payload, _ = backend.run_on_head( handle, code, require_outputs=True, stream_logs=False, separate_stderr=True) if returncode != 0: - return record, ('Failed to refresh replica info from the controller. ' - f'Using the cached record. Reason: {stderr}') + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) + return record, None latest_info = serve_lib.load_latest_info(latest_info_payload) service_handle.uptime = latest_info['uptime'] diff --git a/sky/cli.py b/sky/cli.py index 75aaabccd4b..770e2f4b848 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4204,6 +4204,17 @@ def serve_status(all: bool, service_names: List[str]): replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) + failed_controllers = [ + record['name'] + for record in service_records + if record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED + ] + if failed_controllers: + num_failed = len(failed_controllers) + plural = '' if num_failed == 1 else 's' + click.echo(f'\n* {num_failed} service{plural} with failed controller ' + 'found. The replica info and number might not be accurate.') + @serve.command('down', cls=_DocumentedCodeCommand) @click.argument('service_names',