From 9d6bf82224a473b5f5357a91c9e62d07e71a8e01 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Mon, 25 Mar 2024 12:54:59 +0800 Subject: [PATCH] [Serve] Make controller cloud choose from replica resources if it is not up (#3231) * enable * Apply suggestions from code review Co-authored-by: Ziming Mao --------- Co-authored-by: Ziming Mao --- sky/serve/core.py | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/sky/serve/core.py b/sky/serve/core.py index c41b860257f..1ebfaeba067 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -1,7 +1,8 @@ """SkyServe core APIs.""" import re import tempfile -from typing import Any, Dict, List, Optional, Union +import typing +from typing import Any, Dict, List, Optional, Set, Union import colorama @@ -24,6 +25,9 @@ from sky.utils import subprocess_utils from sky.utils import ux_utils +if typing.TYPE_CHECKING: + from sky import clouds + logger = sky_logging.init_logger(__name__) @@ -122,6 +126,14 @@ def up( controller_utils.maybe_translate_local_file_mounts_and_sync_up(task, path='serve') + # If the controller and replicas are from the same cloud, it should + # provide better connectivity. We will let the controller choose from + # the clouds of the resources if the controller does not exist. + requested_clouds: Set['clouds.Cloud'] = set() + for resources in task.resources: + if resources.cloud is not None: + requested_clouds.add(resources.cloud) + with tempfile.NamedTemporaryFile( prefix=f'service-task-{service_name}-', mode='w', @@ -138,9 +150,11 @@ def up( serve_utils.generate_remote_config_yaml_file_name(service_name)) controller_log_file = ( serve_utils.generate_remote_controller_log_file_name(service_name)) - controller_resources = (controller_utils.get_controller_resources( - controller_type='serve', - controller_resources_config=serve_constants.CONTROLLER_RESOURCES)) + controller_resources_in_config = ( + controller_utils.get_controller_resources( + controller_type='serve', + controller_resources_config=serve_constants.CONTROLLER_RESOURCES + )) vars_to_fill = { 'remote_task_yaml_path': remote_tmp_task_yaml_path, @@ -160,17 +174,21 @@ def up( controller_exist = ( global_user_state.get_cluster_from_name(controller_name) is not None) - requested_cloud = list(task.resources)[0].cloud - controller_cloud = (requested_cloud if not controller_exist and - controller_resources.cloud is None else - controller_resources.cloud) + if (not controller_exist and + controller_resources_in_config.cloud is None): + controller_clouds = requested_clouds + else: + controller_clouds = {controller_resources_in_config.cloud} # TODO(tian): Probably run another sky.launch after we get the load # balancer port from the controller? So we don't need to open so many # ports here. Or, we should have a nginx traffic control to refuse # any connection to the unregistered ports. - controller_resources = controller_resources.copy( - cloud=controller_cloud, - ports=[serve_constants.LOAD_BALANCER_PORT_RANGE]) + controller_resources = { + controller_resources_in_config.copy( + cloud=controller_cloud, + ports=[serve_constants.LOAD_BALANCER_PORT_RANGE]) + for controller_cloud in controller_clouds + } controller_task.set_resources(controller_resources) # # Set service_name so the backend will know to modify default ray