Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
cblmemo committed Aug 28, 2023
1 parent b727ea1 commit f04036e
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 17 deletions.
26 changes: 17 additions & 9 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2619,6 +2619,13 @@ def _service_status_from_replica_info(
return status_lib.ServiceStatus.REPLICA_INIT


def _set_controller_unreachable_status(service_record: Dict[str, Any]) -> None:
    """Flag a service record as having a failed controller, in place.

    The record's own status becomes ``ServiceStatus.CONTROLLER_FAILED`` and
    the status of every entry in its handle's ``replica_info`` is overwritten
    with ``ReplicaStatus.UNKNOWN``.

    Args:
        service_record: The service record to mutate. It must contain a
            'handle' entry whose ``replica_info`` is an iterable of
            mapping-like replica entries.
    """
    # Update the service-level status first, then look up the handle.
    service_record['status'] = status_lib.ServiceStatus.CONTROLLER_FAILED
    handle: serve_lib.ServiceHandle = service_record['handle']
    replicas = handle.replica_info
    for replica in replicas:
        replica['status'] = status_lib.ReplicaStatus.UNKNOWN


def _refresh_service_record_no_lock(
service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
"""Refresh the service, and return the possibly updated record.
Expand All @@ -2628,7 +2635,7 @@ def _refresh_service_record_no_lock(
Returns:
A tuple of a possibly updated record and an error message if any error
occurred when refreshing the service.
message that needs to be printed in CLI when refreshing the service.
"""
record = global_user_state.get_service_from_name(service_name)
if record is None:
Expand All @@ -2638,7 +2645,8 @@ def _refresh_service_record_no_lock(
try:
check_network_connection()
except exceptions.NetworkError:
return record, 'Failed to refresh replica info due to network error.'
return record, ('Failed to refresh replica info due to network error. '
'Using the cached record.')

if not service_handle.endpoint:
# Service controller is still initializing. Skipped refresh status.
Expand All @@ -2649,25 +2657,25 @@ def _refresh_service_record_no_lock(
controller_cluster_name)
if (cluster_record is None or
cluster_record['status'] != status_lib.ClusterStatus.UP):
global_user_state.set_service_status(
service_name, status_lib.ServiceStatus.CONTRLLER_FAILED)
return record, (f'Controller cluster {controller_cluster_name!r} '
'is not found or UP.')
_set_controller_unreachable_status(record)
global_user_state.add_or_update_service(**record)
return record, None

handle = cluster_record['handle']
backend = get_backend_from_handle(handle)
assert isinstance(backend, backends.CloudVmRayBackend)

code = serve_lib.ServeCodeGen.get_latest_info()
returncode, latest_info_payload, stderr = backend.run_on_head(
returncode, latest_info_payload, _ = backend.run_on_head(
handle,
code,
require_outputs=True,
stream_logs=False,
separate_stderr=True)
if returncode != 0:
return record, ('Failed to refresh replica info from the controller. '
f'Using the cached record. Reason: {stderr}')
_set_controller_unreachable_status(record)
global_user_state.add_or_update_service(**record)
return record, None

latest_info = serve_lib.load_latest_info(latest_info_payload)
service_handle.replica_info = latest_info['replica_info']
Expand Down
16 changes: 15 additions & 1 deletion sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3970,7 +3970,7 @@ def serve_up(
service_name)
if previous_service_record is not None:
if previous_service_record['status'] in [
status_lib.ServiceStatus.CONTRLLER_FAILED,
status_lib.ServiceStatus.CONTROLLER_FAILED,
status_lib.ServiceStatus.FAILED
]:
prompt = (f'Service {service_name!r} has failed. '
Expand Down Expand Up @@ -4168,6 +4168,9 @@ def serve_status(all: bool, service_name: Optional[str]):
termination not finished correctly. When seeing this status, please login
to cloud console and check whether there are some resources not released.
- ``UNKNOWN``: The replica status is unknown. This usually happens when the
controller failure is detected and the replica status is not updated yet.
Examples:
.. code-block:: bash
Expand Down Expand Up @@ -4198,6 +4201,17 @@ def serve_status(all: bool, service_name: Optional[str]):
replica_infos.append(replica_record)
status_utils.show_replica_table(replica_infos, all)

failed_controllers = [
record['name']
for record in service_records
if record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED
]
if failed_controllers:
num_failed = len(failed_controllers)
plural = '' if num_failed == 1 else 's'
click.echo(f'\n* {num_failed} service{plural} with failed controller '
'found. The replica info and number might not be accurate.')


@serve.command('down', cls=_DocumentedCodeCommand)
@click.argument('service_names',
Expand Down
2 changes: 1 addition & 1 deletion sky/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def serve_tail_logs(service_record: Dict[str, Any], replica_id: int,
raise ValueError(
f'Service {service_name!r} is still initializing its '
'controller. Please try again later.')
if service_record['status'] == status_lib.ServiceStatus.CONTRLLER_FAILED:
if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED:
with ux_utils.print_exception_no_traceback():
raise ValueError(f'Service {service_name!r}\'s controller failed. '
'Cannot tail logs.')
Expand Down
6 changes: 3 additions & 3 deletions sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ def serve_up(
if (cluster_record is None or
cluster_record['status'] != status_lib.ClusterStatus.UP):
global_user_state.set_service_status(
service_name, status_lib.ServiceStatus.CONTRLLER_FAILED)
service_name, status_lib.ServiceStatus.CONTROLLER_FAILED)
print(f'{colorama.Fore.RED}Controller failed to launch. '
f'Please check the logs above.{colorama.Style.RESET_ALL}')
return
Expand Down Expand Up @@ -1109,7 +1109,7 @@ def _wait_until_job_is_running(cluster_name: str,
controller_cluster_name, 1)
if not controller_job_is_running:
global_user_state.set_service_status(
service_name, status_lib.ServiceStatus.CONTRLLER_FAILED)
service_name, status_lib.ServiceStatus.CONTROLLER_FAILED)
print(f'{colorama.Fore.RED}Controller failed to launch. '
f'Please check the logs with sky serve logs {service_name} '
f'--controller{colorama.Style.RESET_ALL}')
Expand All @@ -1136,7 +1136,7 @@ def _wait_until_job_is_running(cluster_name: str,
controller_cluster_name, 2)
if not redirector_job_is_running:
global_user_state.set_service_status(
service_name, status_lib.ServiceStatus.CONTRLLER_FAILED)
service_name, status_lib.ServiceStatus.CONTROLLER_FAILED)
print(f'{colorama.Fore.RED}Redirector failed to launch. '
f'Please check the logs with sky serve logs {service_name} '
f'--redirector{colorama.Style.RESET_ALL}')
Expand Down
6 changes: 3 additions & 3 deletions sky/status_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class ServiceStatus(enum.Enum):

# Controller failed to initialize / controller or redirector process
# status abnormal
CONTRLLER_FAILED = 'CONTROLLER_FAILED'
CONTROLLER_FAILED = 'CONTROLLER_FAILED'

# At least one replica is ready
READY = 'READY'
Expand All @@ -81,7 +81,7 @@ def colored_str(self):
_SERVICE_STATUS_TO_COLOR = {
ServiceStatus.CONTROLLER_INIT: colorama.Fore.BLUE,
ServiceStatus.REPLICA_INIT: colorama.Fore.BLUE,
ServiceStatus.CONTRLLER_FAILED: colorama.Fore.RED,
ServiceStatus.CONTROLLER_FAILED: colorama.Fore.RED,
ServiceStatus.READY: colorama.Fore.GREEN,
ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW,
ServiceStatus.FAILED: colorama.Fore.RED,
Expand Down Expand Up @@ -118,7 +118,7 @@ class ReplicaStatus(enum.Enum):
# leakage.
FAILED_CLEANUP = 'FAILED_CLEANUP'

# Unknown status. This should never happen.
# Unknown status. This should never happen except for controller failure.
UNKNOWN = 'UNKNOWN'

@classmethod
Expand Down

0 comments on commit f04036e

Please sign in to comment.