Skip to content

Commit

Permalink
[UX] Rephrase service initialization timeout (#3176)
Browse files (browse the repository at this point in the history)
* fix

* Update sky/serve/serve_utils.py

Co-authored-by: Zongheng Yang <[email protected]>

* lint & rename vars

* change func name

---------

Co-authored-by: Zongheng Yang <[email protected]>
  • Loading branch information
cblmemo and concretevitamin authored Feb 18, 2024
1 parent 30d0a51 commit bd64e18
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 10 deletions.
4 changes: 2 additions & 2 deletions sky/serve/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# Signal file path for controller to handle signals.
SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}'

# Time to wait in seconds for service to initialize.
INITIALIZATION_TIMEOUT_SECONDS = 60
# Time to wait in seconds for service to register on the controller.
SERVICE_REGISTER_TIMEOUT_SECONDS = 60

# The time interval in seconds for load balancer to sync with controller. Every
# time the load balancer syncs with controller, it will update all available
Expand Down
4 changes: 2 additions & 2 deletions sky/serve/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,11 @@ def up(
# TODO(tian): Cache endpoint locally to speed up. Endpoint won't
# change after the first time, so there is no consistency issue.
with rich_utils.safe_status(
'[cyan]Waiting for the service to initialize[/]'):
'[cyan]Waiting for the service to register[/]'):
# This function will check the controller job id in the database
# and return the endpoint if the job id matches. Otherwise it will
# return None.
code = serve_utils.ServeCodeGen.wait_service_initialization(
code = serve_utils.ServeCodeGen.wait_service_registration(
service_name, controller_job_id)
backend = backend_utils.get_backend_from_handle(controller_handle)
assert isinstance(backend, backends.CloudVmRayBackend)
Expand Down
21 changes: 15 additions & 6 deletions sky/serve/serve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
return '\n'.join(messages)


def wait_service_initialization(service_name: str, job_id: int) -> str:
def wait_service_registration(service_name: str, job_id: int) -> str:
"""Util function to call at the end of `sky.serve.up()`.
This function will:
Expand Down Expand Up @@ -465,10 +465,19 @@ def wait_service_initialization(service_name: str, job_id: int) -> str:
raise RuntimeError('Max number of services reached. '
'To spin up more services, please '
'tear down some existing services.')
if time.time() - start_time > constants.INITIALIZATION_TIMEOUT_SECONDS:
elapsed = time.time() - start_time
if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
# Print the controller log to help user debug.
controller_log_path = (
generate_remote_controller_log_file_name(service_name))
with open(os.path.expanduser(controller_log_path),
'r',
encoding='utf-8') as f:
log_content = f.read()
with ux_utils.print_exception_no_traceback():
raise ValueError(
f'Initialization of service {service_name!r} timeout.')
raise ValueError(f'Failed to register service {service_name!r} '
'on the SkyServe controller. '
f'Reason:\n{log_content}')
time.sleep(1)


Expand Down Expand Up @@ -836,9 +845,9 @@ def terminate_services(cls, service_names: Optional[List[str]],
return cls._build(code)

@classmethod
def wait_service_initialization(cls, service_name: str, job_id: int) -> str:
def wait_service_registration(cls, service_name: str, job_id: int) -> str:
code = [
'msg = serve_utils.wait_service_initialization('
'msg = serve_utils.wait_service_registration('
f'{service_name!r}, {job_id})', 'print(msg, end="", flush=True)'
]
return cls._build(code)
Expand Down

0 comments on commit bd64e18

Please sign in to comment.