From 34598a285c805838cd36d6f2aefb2e76415a6ab7 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 1 Feb 2024 08:05:07 -0800 Subject: [PATCH 01/85] playing around --- sky/serve/controller.py | 2 +- sky/serve/core.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 5b28855b1d1..2e1ec54b5a7 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -137,7 +137,7 @@ def configure_logger(): logger.info('SkyServe Controller started on ' f'http://localhost:{self._port}') - uvicorn.run(self._app, host='localhost', port=self._port) + uvicorn.run(self._app, host='0.0.0.0', port=self._port) # TODO(tian): Probably we should support service that will stop the VM in diff --git a/sky/serve/core.py b/sky/serve/core.py index 1ac2202aa1f..432560d814a 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -168,8 +168,6 @@ def up( stream_logs=False, cluster_name=controller_name, detach_run=True, - idle_minutes_to_autostop=constants. - CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) From 95b1eb071ae1a5a81642d6b02ca4e30422b080c5 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 2 Feb 2024 12:44:11 -0800 Subject: [PATCH 02/85] wip with hacks --- sky/adaptors/kubernetes.py | 2 ++ sky/clouds/kubernetes.py | 12 +++++++++++- sky/serve/core.py | 6 ++++++ sky/utils/controller_utils.py | 10 ++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index 093274044d4..d7a856b73a6 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -62,6 +62,8 @@ def _load_config(): # See issue: https://github.com/skypilot-org/skypilot/issues/2287 os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc' os.environ['KUBERNETES_SERVICE_PORT'] = '443' + # TODO(romilb) - THIS IS HACK! FIX THIS BEFORE MERGING. 
+ raise kubernetes.config.config_exception.ConfigException kubernetes.config.load_incluster_config() except kubernetes.config.config_exception.ConfigException: try: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 4ecc9a7a121..295ac6d12da 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -341,7 +341,17 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: f'check if {CREDENTIAL_PATH} exists.') def get_credential_file_mounts(self) -> Dict[str, str]: - return {CREDENTIAL_PATH: CREDENTIAL_PATH} + # Okay, I need to + # 1) Generate a SA with the right permissions + # 2) Generate a kubeconfig file with the SA token + # 3) Mount the kubeconfig file into the container + # 4) Additionally, check in the container during auth if the API server IP is reachable. If not, resolve it using the service DNS and use that as the IP. + # 5) Use the same sky-key for all containers in the pod. + return { + # TODO(romilb): This is a hack! Fix this before merging. + '~/.ssh/sky-key': '~/.ssh/sky-key', + '~/.ssh/sky-key.pub': '~/.ssh/sky-key.pub', + CREDENTIAL_PATH: CREDENTIAL_PATH} def instance_type_exists(self, instance_type: str) -> bool: return kubernetes_utils.KubernetesInstanceType.is_valid_instance_type( diff --git a/sky/serve/core.py b/sky/serve/core.py index 432560d814a..ff44efc843c 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -8,6 +8,7 @@ import sky from sky import backends +from sky import clouds from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -163,11 +164,16 @@ def up( # whether the service is already running. If the id is the same # with the current job id, we know the service is up and running # for the first time; otherwise it is a name conflict. 
+ idle_minutes_to_autostop = None if ( + controller_cloud and + controller_cloud.is_same_cloud(clouds.Kubernetes()) + ) else constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP controller_job_id, controller_handle = execution.execute( entrypoint=controller_task, stream_logs=False, cluster_name=controller_name, detach_run=True, + idle_minutes_to_autostop=idle_minutes_to_autostop, retry_until_up=True, ) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 3096dd305a5..27bc9e4e48a 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -161,6 +161,16 @@ def _get_cloud_dependencies_installation_commands( 'pip list | grep azure-cli > /dev/null 2>&1 || ' 'pip install azure-cli>=2.31.0 azure-core azure-identity>=1.13.0 ' 'azure-mgmt-network > /dev/null 2>&1') + if any( + cloud.is_same_cloud(clouds.Kubernetes()) + for cloud in global_user_state.get_enabled_clouds()): + commands.append( + # Install k8s + skypilot dependencies + 'sudo bash -c "apt update && apt install curl socat netcat -y" && ' + # Install kubectl + 'curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' + 'sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && ' + ) return commands From f93e6ddc5a3c90eab0cba0e1b365831b9aedd2f5 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 5 Feb 2024 15:32:50 -0800 Subject: [PATCH 03/85] wip refactor get_endpoints --- sky/cli.py | 74 ++++++--------------------------- sky/core.py | 77 +++++++++++++++++++++++++++++++++++ sky/provision/aws/instance.py | 14 ++++++- sky/provision/common.py | 30 +++++++------- 4 files changed, 116 insertions(+), 79 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 2b40ba074c9..25664a54bc0 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1991,69 +1991,19 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, head_ip = handle.external_ips()[0] if show_endpoints: - launched_resources = 
handle.launched_resources - cloud = launched_resources.cloud - try: - cloud.check_features_are_supported( - launched_resources, - {clouds.CloudImplementationFeatures.OPEN_PORTS}) - except exceptions.NotSupportedError: - with ux_utils.print_exception_no_traceback(): - raise ValueError('Querying endpoints is not supported ' - f'for {cloud}.') from None - - config = common_utils.read_yaml(handle.cluster_yaml) - port_details = provision_lib.query_ports( - repr(cloud), handle.cluster_name_on_cloud, - handle.launched_resources.ports, config['provider']) - - if endpoint is not None: - # If cluster had no ports to be exposed - if str(endpoint) not in handle.launched_resources.ports: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Port {endpoint} is not exposed ' - 'on cluster ' - f'{cluster_record["name"]!r}.') - # If the user requested a specific port endpoint - if endpoint not in port_details: - error_msg = (f'Port {endpoint} not exposed yet. ' - f'{_ENDPOINTS_RETRY_MESSAGE} ') - if handle.launched_resources.cloud.is_same_cloud( - clouds.Kubernetes()): - # Add Kubernetes specific debugging info - error_msg += ( - kubernetes_utils.get_endpoint_debug_message()) - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(error_msg) - click.echo(port_details[endpoint][0].url(ip=head_ip)) - return - - if not port_details: - # If cluster had no ports to be exposed - if handle.launched_resources.ports is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError('Cluster does not have any ports ' - 'to be exposed.') - # Else wait for the ports to be exposed - else: - error_msg = (f'No endpoints exposed yet. 
' - f'{_ENDPOINTS_RETRY_MESSAGE} ') - if handle.launched_resources.cloud.is_same_cloud( - clouds.Kubernetes()): - # Add Kubernetes specific debugging info - error_msg += \ - kubernetes_utils.get_endpoint_debug_message() - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(error_msg) - - for port, urls in port_details.items(): - click.echo( - f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' - f'{colorama.Style.RESET_ALL}: ' - f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'{urls[0].url(ip=head_ip)}{colorama.Style.RESET_ALL}') + query_endpoint = endpoint # Either port number of none if endpoints is ued + cluster_endpoints = core.get_endpoints(cluster_record['name'], + query_endpoint) + if query_endpoint: + click.echo(cluster_endpoints[endpoint]) + else: + for port, port_endpoint in cluster_endpoints.items(): + click.echo( + f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' + f'{colorama.Style.RESET_ALL}: ' + f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'{port_endpoint}{colorama.Style.RESET_ALL}') return - click.echo(head_ip) return hints = [] diff --git a/sky/core.py b/sky/core.py index d7174e1522e..d762160027e 100644 --- a/sky/core.py +++ b/sky/core.py @@ -12,14 +12,19 @@ from sky import data from sky import exceptions from sky import global_user_state +from sky import provision as provision_lib from sky import sky_logging from sky import spot from sky import status_lib from sky import task from sky.backends import backend_utils +from sky.provision import common as provision_common +# TODO(romilb): This is a bad import - refactor to avoid this. 
+from sky.provision.kubernetes import utils as kubernetes_utils from sky.skylet import constants from sky.skylet import job_lib from sky.usage import usage_lib +from sky.utils import common_utils from sky.utils import controller_utils from sky.utils import rich_utils from sky.utils import subprocess_utils @@ -997,3 +1002,75 @@ def storage_delete(name: str) -> None: source=handle.source, sync_on_reconstruction=False) store_object.delete() + +_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' + 'please retry after a while.') + +def get_endpoints(cluster: str, endpoint: Optional[int]) -> Dict[int, provision_common.Endpoint]: + cluster_records = status(cluster_names=[cluster]) + #TODO(romilb): Add error message for > 1 cluster records here before merging. + cluster_record = cluster_records[0] + if cluster_record['status'] != status_lib.ClusterStatus.UP: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(f'Cluster {cluster_record["name"]!r} ' + 'is not in UP status.') + handle = cluster_record['handle'] + if not isinstance(handle, backends.CloudVmRayResourceHandle): + with ux_utils.print_exception_no_traceback(): + raise ValueError('Querying IP address is not supported ' + 'for local clusters.') + + launched_resources = handle.launched_resources + cloud = launched_resources.cloud + try: + cloud.check_features_are_supported( + launched_resources, + {clouds.CloudImplementationFeatures.OPEN_PORTS}) + except exceptions.NotSupportedError: + with ux_utils.print_exception_no_traceback(): + raise ValueError('Querying endpoints is not supported ' + f'for {cloud}.') from None + + config = common_utils.read_yaml(handle.cluster_yaml) + port_details = provision_lib.query_ports( + repr(cloud), handle.cluster_name_on_cloud, + handle.launched_resources.ports, config['provider']) + + if endpoint is not None: + # If cluster had no ports to be exposed + if str(endpoint) not in handle.launched_resources.ports: + with 
ux_utils.print_exception_no_traceback(): + raise ValueError(f'Port {endpoint} is not exposed ' + 'on cluster ' + f'{cluster_record["name"]!r}.') + # If the user requested a specific port endpoint + if endpoint not in port_details: + error_msg = (f'Port {endpoint} not exposed yet. ' + f'{_ENDPOINTS_RETRY_MESSAGE} ') + if handle.launched_resources.cloud.is_same_cloud( + clouds.Kubernetes()): + # Add Kubernetes specific debugging info + error_msg += ( + kubernetes_utils.get_endpoint_debug_message()) + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(error_msg) + return {endpoint: port_details[endpoint][0].url()} + else: + if not port_details: + # If cluster had no ports to be exposed + if handle.launched_resources.ports is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError('Cluster does not have any ports ' + 'to be exposed.') + # Else wait for the ports to be exposed + else: + error_msg = (f'No endpoints exposed yet. ' + f'{_ENDPOINTS_RETRY_MESSAGE} ') + if handle.launched_resources.cloud.is_same_cloud( + clouds.Kubernetes()): + # Add Kubernetes specific debugging info + error_msg += \ + kubernetes_utils.get_endpoint_debug_message() + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(error_msg) + return {port: urls[0].url() for port, urls in port_details.items()} \ No newline at end of file diff --git a/sky/provision/aws/instance.py b/sky/provision/aws/instance.py index 54178cc78da..af4ecc03cca 100644 --- a/sky/provision/aws/instance.py +++ b/sky/provision/aws/instance.py @@ -817,5 +817,15 @@ def query_ports( provider_config: Optional[Dict[str, Any]] = None, ) -> Dict[int, List[common.Endpoint]]: """See sky/provision/__init__.py""" - return common.query_ports_passthrough(cluster_name_on_cloud, ports, - provider_config) + cluster_info = get_cluster_info(provider_config['region'], + cluster_name_on_cloud, + provider_config=provider_config) + head_instance = 
cluster_info.instances.get(cluster_info.head_instance_id) + if head_instance is None: + return {} + head_ip = head_instance[0].external_ip + ports = list(resources_utils.port_ranges_to_set(ports)) + result: Dict[int, List[common.Endpoint]] = {} + for port in ports: + result[port] = [common.SocketEndpoint(port=port, host=head_ip)] + return result \ No newline at end of file diff --git a/sky/provision/common.py b/sky/provision/common.py index 75178b15623..5c8afde769b 100644 --- a/sky/provision/common.py +++ b/sky/provision/common.py @@ -201,7 +201,7 @@ class Endpoint: pass @abc.abstractmethod - def url(self, ip: str): + def url(self, override_ip: Optional[str] = None) -> str: raise NotImplementedError @@ -211,30 +211,29 @@ class SocketEndpoint(Endpoint): port: Optional[int] host: str = '' - def url(self, ip: str): - if not self.host: - self.host = ip - return f'{self.host}{":" + str(self.port) if self.port else ""}' + def url(self, override_ip: Optional[str] = None) -> str: + host = override_ip if override_ip else self.host + return f'{host}{":" + str(self.port) if self.port else ""}' @dataclasses.dataclass class HTTPEndpoint(SocketEndpoint): - """HTTP endpoint accesible via a url.""" + """HTTP endpoint accessible via a url.""" path: str = '' - def url(self, ip: str): - del ip # Unused. - return f'http://{os.path.join(super().url(self.host), self.path)}' + def url(self, override_ip: Optional[str] = None) -> str: + host = override_ip if override_ip else self.host + return f'http://{os.path.join(super().url(host), self.path)}' @dataclasses.dataclass class HTTPSEndpoint(SocketEndpoint): - """HTTPS endpoint accesible via a url.""" + """HTTPS endpoint accessible via a url.""" path: str = '' - def url(self, ip: str): - del ip # Unused. 
- return f'https://{os.path.join(super().url(self.host), self.path)}' + def url(self, override_ip: Optional[str] = None) -> str: + host = override_ip if override_ip else self.host + return f'https://{os.path.join(super().url(host), self.path)}' def query_ports_passthrough( @@ -246,9 +245,10 @@ def query_ports_passthrough( Returns a list of socket endpoint with empty host and the input ports.""" del cluster_name_on_cloud, provider_config # Unused. + handle = cluster_record['handle'] + head_ip = handle.external_ips()[0] ports = list(port_ranges_to_set(ports)) result: Dict[int, List[Endpoint]] = {} for port in ports: - result[port] = [SocketEndpoint(port=port)] - + result[port] = [SocketEndpoint(port=port, host=head_ip)] return result From c8276bb629de36318ed0b00636b24af0353f870f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 5 Feb 2024 16:51:49 -0800 Subject: [PATCH 04/85] working get_endpoints --- sky/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index d762160027e..fe6f196ff88 100644 --- a/sky/core.py +++ b/sky/core.py @@ -26,6 +26,7 @@ from sky.usage import usage_lib from sky.utils import common_utils from sky.utils import controller_utils +from sky.utils import resources_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import ux_utils @@ -1038,7 +1039,8 @@ def get_endpoints(cluster: str, endpoint: Optional[int]) -> Dict[int, provision_ if endpoint is not None: # If cluster had no ports to be exposed - if str(endpoint) not in handle.launched_resources.ports: + port_set = resources_utils.port_ranges_to_set(handle.launched_resources.ports) + if endpoint not in port_set: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Port {endpoint} is not exposed ' 'on cluster ' From c713293aaa722de5b7841848ee44e215ce27552f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 6 Feb 2024 11:05:10 -0800 Subject: [PATCH 05/85] wip --- sky/core.py | 25 
+++++++++++++++++++++++-- sky/serve/core.py | 4 ++-- sky/serve/replica_managers.py | 6 ++++-- sky/serve/serve_utils.py | 10 +++++++--- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/sky/core.py b/sky/core.py index fe6f196ff88..6b4533ecf42 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1007,7 +1007,28 @@ def storage_delete(name: str) -> None: _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' 'please retry after a while.') -def get_endpoints(cluster: str, endpoint: Optional[int]) -> Dict[int, provision_common.Endpoint]: +def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ + Union[str, Dict[int, provision_common.Endpoint]]: + """ + + Args: + cluster: + endpoint: + + Returns: Endpoint URL st if endpoint is not None, else a dictionary of all + + Raises: + ValueError: if the cluster is not UP or the endpoint is not exposed. + RuntimeError: if the cluster has no ports to be exposed or no endpoints + are exposed yet. + """ + # Cast endpoint to int if it is not None + if endpoint is not None: + try: + endpoint = int(endpoint) + except ValueError: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Invalid endpoint {endpoint!r}.') from None cluster_records = status(cluster_names=[cluster]) #TODO(romilb): Add error message for > 1 cluster records here before merging. 
cluster_record = cluster_records[0] @@ -1056,7 +1077,7 @@ def get_endpoints(cluster: str, endpoint: Optional[int]) -> Dict[int, provision_ kubernetes_utils.get_endpoint_debug_message()) with ux_utils.print_exception_no_traceback(): raise RuntimeError(error_msg) - return {endpoint: port_details[endpoint][0].url()} + return port_details[endpoint][0].url() else: if not port_details: # If cluster had no ports to be exposed diff --git a/sky/serve/core.py b/sky/serve/core.py index ff44efc843c..89538cf709b 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -8,7 +8,7 @@ import sky from sky import backends -from sky import clouds +from sky import core from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -233,7 +233,7 @@ def up( else: lb_port = serve_utils.load_service_initialization_result( lb_port_payload) - endpoint = f'{controller_handle.head_ip}:{lb_port}' + endpoint = core.get_endpoints(controller_handle.cluster_name, lb_port) sky_logging.print( f'{fore.CYAN}Service name: ' diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index ffc16c29c42..ecaf71818a7 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -16,7 +16,7 @@ import requests import sky -from sky import backends +from sky import backends, core from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -370,7 +370,9 @@ def url(self) -> Optional[str]: handle = self.handle() if handle is None: return None - return f'{handle.head_ip}:{self.replica_port}' + endpoints = core.get_endpoints(handle.cluster_name, self.replica_port) + # TODO(romilb): Fix this type casting mess before merging. 
+ return endpoints.get(int(self.replica_port), None) @property def status(self) -> serve_state.ReplicaStatus: diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 9e8176e08a6..7ac9ff916e5 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -19,7 +19,7 @@ import psutil import requests -from sky import backends +from sky import backends, core from sky import exceptions from sky import global_user_state from sky import status_lib @@ -672,12 +672,16 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: handle = global_user_state.get_handle_from_cluster_name( SKY_SERVE_CONTROLLER_NAME) assert isinstance(handle, backends.CloudVmRayResourceHandle) - if handle is None or handle.head_ip is None: + if handle is None: return '-' load_balancer_port = service_record['load_balancer_port'] if load_balancer_port is None: return '-' - return f'{handle.head_ip}:{load_balancer_port}' + try: + endpoint = core.get_endpoints(handle, load_balancer_port) + except RuntimeError: + return '-' + return endpoint.url() def format_service_table(service_records: List[Dict[str, Any]], From aa286e8069b334387d80c1590f5521230c9680fb Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 6 Feb 2024 12:30:47 -0800 Subject: [PATCH 06/85] fixed circular import --- sky/backends/backend_utils.py | 98 +++++++++++++ sky/cli.py | 1 - sky/core.py | 2 +- sky/serve/core.py | 3 +- sky/serve/replica_managers.py | 2 +- sky/serve/serve_utils.py | 7 +- .../kubernetes/generate_static_kubeconfig.sh | 137 ++++++++++++++++++ 7 files changed, 242 insertions(+), 8 deletions(-) create mode 100755 sky/utils/kubernetes/generate_static_kubeconfig.sh diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 7ba052bb7c2..3dd2a6fc27e 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -48,6 +48,7 @@ from sky.utils import controller_utils from sky.utils import env_options from sky.utils import rich_utils +from sky.utils import 
resources_utils from sky.utils import subprocess_utils from sky.utils import timeline from sky.utils import ux_utils @@ -2688,3 +2689,100 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str, f'not interrupted): {colorama.Style.BRIGHT}sky start -f -y ' f'{cluster_name}{colorama.Style.RESET_ALL}' f'\n--- Details ---\n{stderr.strip()}\n') + +def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ + Union[str, Dict[int, str]]: + """ + + Args: + cluster: + endpoint: + + Returns: Endpoint URL if endpoint is not None, else a dictionary of all + + Raises: + ValueError: if the cluster is not UP or the endpoint is not exposed. + RuntimeError: if the cluster has no ports to be exposed or no endpoints + are exposed yet. + """ + + _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' + 'please retry after a while.') + + # Cast endpoint to int if it is not None + if endpoint is not None: + try: + endpoint = int(endpoint) + except ValueError: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Invalid endpoint {endpoint!r}.') from None + cluster_records = get_clusters(include_controller=True, + refresh=False, + cluster_names=[cluster]) + #TODO(romilb): Add error message for > 1 cluster records here before merging. 
+ cluster_record = cluster_records[0] + if cluster_record['status'] != status_lib.ClusterStatus.UP: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(f'Cluster {cluster_record["name"]!r} ' + 'is not in UP status.') + handle = cluster_record['handle'] + if not isinstance(handle, backends.CloudVmRayResourceHandle): + with ux_utils.print_exception_no_traceback(): + raise ValueError('Querying IP address is not supported ' + 'for local clusters.') + + launched_resources = handle.launched_resources + cloud = launched_resources.cloud + try: + cloud.check_features_are_supported( + launched_resources, + {clouds.CloudImplementationFeatures.OPEN_PORTS}) + except exceptions.NotSupportedError: + with ux_utils.print_exception_no_traceback(): + raise ValueError('Querying endpoints is not supported ' + f'for {cloud}.') from None + + config = common_utils.read_yaml(handle.cluster_yaml) + port_details = provision_lib.query_ports( + repr(cloud), handle.cluster_name_on_cloud, + handle.launched_resources.ports, config['provider']) + + if endpoint is not None: + # If cluster had no ports to be exposed + port_set = resources_utils.port_ranges_to_set(handle.launched_resources.ports) + if endpoint not in port_set: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Port {endpoint} is not exposed ' + 'on cluster ' + f'{cluster_record["name"]!r}.') + # If the user requested a specific port endpoint + if endpoint not in port_details: + error_msg = (f'Port {endpoint} not exposed yet. 
' + f'{_ENDPOINTS_RETRY_MESSAGE} ') + if handle.launched_resources.cloud.is_same_cloud( + clouds.Kubernetes()): + # Add Kubernetes specific debugging info + error_msg += ( + kubernetes_utils.get_endpoint_debug_message()) + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(error_msg) + return port_details[endpoint][0].url() + else: + if not port_details: + # If cluster had no ports to be exposed + if handle.launched_resources.ports is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError('Cluster does not have any ports ' + 'to be exposed.') + # Else wait for the ports to be exposed + else: + error_msg = (f'No endpoints exposed yet. ' + f'{_ENDPOINTS_RETRY_MESSAGE} ') + if handle.launched_resources.cloud.is_same_cloud( + clouds.Kubernetes()): + # Add Kubernetes specific debugging info + error_msg += \ + kubernetes_utils.get_endpoint_debug_message() + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(error_msg) + return {port: urls[0].url() for port, urls in port_details.items()} \ No newline at end of file diff --git a/sky/cli.py b/sky/cli.py index 25664a54bc0..371274d4e15 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -51,7 +51,6 @@ from sky import core from sky import exceptions from sky import global_user_state -from sky import provision as provision_lib from sky import serve as serve_lib from sky import sky_logging from sky import spot as spot_lib diff --git a/sky/core.py b/sky/core.py index 6b4533ecf42..61f336308ac 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1015,7 +1015,7 @@ def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ cluster: endpoint: - Returns: Endpoint URL st if endpoint is not None, else a dictionary of all + Returns: Endpoint URL if endpoint is not None, else a dictionary of all Raises: ValueError: if the cluster is not UP or the endpoint is not exposed. 
diff --git a/sky/serve/core.py b/sky/serve/core.py index 89538cf709b..6a835ea35f0 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -8,7 +8,6 @@ import sky from sky import backends -from sky import core from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -233,7 +232,7 @@ def up( else: lb_port = serve_utils.load_service_initialization_result( lb_port_payload) - endpoint = core.get_endpoints(controller_handle.cluster_name, lb_port) + endpoint = backend_utils.get_endpoints(controller_handle.cluster_name, lb_port) sky_logging.print( f'{fore.CYAN}Service name: ' diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index ecaf71818a7..9bdc86a8ca0 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -370,7 +370,7 @@ def url(self) -> Optional[str]: handle = self.handle() if handle is None: return None - endpoints = core.get_endpoints(handle.cluster_name, self.replica_port) + endpoints = backend_utils.get_endpoints(handle.cluster_name, self.replica_port) # TODO(romilb): Fix this type casting mess before merging. 
return endpoints.get(int(self.replica_port), None) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 7ac9ff916e5..bf2ed895eb7 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -19,10 +19,11 @@ import psutil import requests -from sky import backends, core +from sky import backends from sky import exceptions from sky import global_user_state from sky import status_lib +from sky.backends import backend_utils from sky.serve import constants from sky.serve import serve_state from sky.skylet import constants as skylet_constants @@ -678,10 +679,10 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: if load_balancer_port is None: return '-' try: - endpoint = core.get_endpoints(handle, load_balancer_port) + endpoint = backend_utils.get_endpoints(handle.cluster_name, load_balancer_port) except RuntimeError: return '-' - return endpoint.url() + return endpoint def format_service_table(service_records: List[Dict[str, Any]], diff --git a/sky/utils/kubernetes/generate_static_kubeconfig.sh b/sky/utils/kubernetes/generate_static_kubeconfig.sh new file mode 100755 index 00000000000..30ea929177a --- /dev/null +++ b/sky/utils/kubernetes/generate_static_kubeconfig.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# This script creates a new k8s Service Account and generates a kubeconfig with +# its credentials. This Service Account has all the necessary permissions for +# SkyPilot. The kubeconfig is written in the current directory. +# +# You must configure your local kubectl to point to the right k8s cluster and +# have admin-level access. +# +# Note: all of the k8s resources are created in namespace "skypilot". If you +# delete any of these objects, SkyPilot will stop working. +# +# You can override the default namespace "skypilot" using the +# SKYPILOT_NAMESPACE environment variable. +# You can override the default service account name "skypilot-sa" using the +# SKYPILOT_SA_NAME environment variable. 
+ +set -eu -o pipefail + +# Allow passing in common name and username in environment. If not provided, +# use default. +SKYPILOT_SA=${SKYPILOT_SA_NAME:-skypilot-sa} +NAMESPACE=${SKYPILOT_NAMESPACE:-default} + +# Set OS specific values. +if [[ "$OSTYPE" == "linux-gnu" ]]; then + BASE64_DECODE_FLAG="-d" +elif [[ "$OSTYPE" == "darwin"* ]]; then + BASE64_DECODE_FLAG="-D" +elif [[ "$OSTYPE" == "linux-musl" ]]; then + BASE64_DECODE_FLAG="-d" +else + echo "Unknown OS ${OSTYPE}" + exit 1 +fi + +echo "Creating the Kubernetes Service Account with minimal RBAC permissions." +kubectl apply -f - < kubeconfig < Date: Tue, 6 Feb 2024 15:40:32 -0800 Subject: [PATCH 07/85] Working for ingress and loadbalancer svc --- sky/cli.py | 11 ++-- sky/core.py | 94 ----------------------------------- sky/serve/constants.py | 2 +- sky/serve/load_balancer.py | 2 +- sky/serve/replica_managers.py | 29 ++++++++--- sky/serve/serve_utils.py | 7 ++- 6 files changed, 34 insertions(+), 111 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 371274d4e15..fff0cd262c8 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1990,12 +1990,13 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, head_ip = handle.external_ips()[0] if show_endpoints: - query_endpoint = endpoint # Either port number of none if endpoints is ued - cluster_endpoints = core.get_endpoints(cluster_record['name'], - query_endpoint) - if query_endpoint: - click.echo(cluster_endpoints[endpoint]) + if endpoint: + cluster_endpoint = backend_utils.get_endpoints(cluster_record['name'], + endpoint) + click.echo(cluster_endpoint) else: + cluster_endpoints = backend_utils.get_endpoints( + cluster_record['name']) for port, port_endpoint in cluster_endpoints.items(): click.echo( f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' diff --git a/sky/core.py b/sky/core.py index 61f336308ac..e0c392a5cca 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1003,97 +1003,3 @@ def storage_delete(name: str) -> None: source=handle.source, 
sync_on_reconstruction=False) store_object.delete() - -_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' - 'please retry after a while.') - -def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ - Union[str, Dict[int, provision_common.Endpoint]]: - """ - - Args: - cluster: - endpoint: - - Returns: Endpoint URL if endpoint is not None, else a dictionary of all - - Raises: - ValueError: if the cluster is not UP or the endpoint is not exposed. - RuntimeError: if the cluster has no ports to be exposed or no endpoints - are exposed yet. - """ - # Cast endpoint to int if it is not None - if endpoint is not None: - try: - endpoint = int(endpoint) - except ValueError: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Invalid endpoint {endpoint!r}.') from None - cluster_records = status(cluster_names=[cluster]) - #TODO(romilb): Add error message for > 1 cluster records here before merging. - cluster_record = cluster_records[0] - if cluster_record['status'] != status_lib.ClusterStatus.UP: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(f'Cluster {cluster_record["name"]!r} ' - 'is not in UP status.') - handle = cluster_record['handle'] - if not isinstance(handle, backends.CloudVmRayResourceHandle): - with ux_utils.print_exception_no_traceback(): - raise ValueError('Querying IP address is not supported ' - 'for local clusters.') - - launched_resources = handle.launched_resources - cloud = launched_resources.cloud - try: - cloud.check_features_are_supported( - launched_resources, - {clouds.CloudImplementationFeatures.OPEN_PORTS}) - except exceptions.NotSupportedError: - with ux_utils.print_exception_no_traceback(): - raise ValueError('Querying endpoints is not supported ' - f'for {cloud}.') from None - - config = common_utils.read_yaml(handle.cluster_yaml) - port_details = provision_lib.query_ports( - repr(cloud), handle.cluster_name_on_cloud, - handle.launched_resources.ports, config['provider']) - - if endpoint 
is not None: - # If cluster had no ports to be exposed - port_set = resources_utils.port_ranges_to_set(handle.launched_resources.ports) - if endpoint not in port_set: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Port {endpoint} is not exposed ' - 'on cluster ' - f'{cluster_record["name"]!r}.') - # If the user requested a specific port endpoint - if endpoint not in port_details: - error_msg = (f'Port {endpoint} not exposed yet. ' - f'{_ENDPOINTS_RETRY_MESSAGE} ') - if handle.launched_resources.cloud.is_same_cloud( - clouds.Kubernetes()): - # Add Kubernetes specific debugging info - error_msg += ( - kubernetes_utils.get_endpoint_debug_message()) - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(error_msg) - return port_details[endpoint][0].url() - else: - if not port_details: - # If cluster had no ports to be exposed - if handle.launched_resources.ports is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError('Cluster does not have any ports ' - 'to be exposed.') - # Else wait for the ports to be exposed - else: - error_msg = (f'No endpoints exposed yet. ' - f'{_ENDPOINTS_RETRY_MESSAGE} ') - if handle.launched_resources.cloud.is_same_cloud( - clouds.Kubernetes()): - # Add Kubernetes specific debugging info - error_msg += \ - kubernetes_utils.get_endpoint_debug_message() - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(error_msg) - return {port: urls[0].url() for port, urls in port_details.items()} \ No newline at end of file diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 5b8fa8f206a..b1698157320 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -62,7 +62,7 @@ # automatically generated from this start port. CONTROLLER_PORT_START = 20001 LOAD_BALANCER_PORT_START = 30001 -LOAD_BALANCER_PORT_RANGE = '30001-30100' +LOAD_BALANCER_PORT_RANGE = '30001-30020' # Initial version of service. 
INITIAL_VERSION = 1 diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index b3b8fe5403e..a96fc648470 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -87,7 +87,7 @@ async def _redirect_handler(self, request: fastapi.Request): 'Use "sky serve status [SERVICE_NAME]" ' 'to check the replica status.') - path = f'http://{ready_replica_url}{request.url.path}' + path = f'{ready_replica_url}{request.url.path}' logger.info(f'Redirecting request to {path}') return fastapi.responses.RedirectResponse(url=path) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 9bdc86a8ca0..04687efd1e4 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -370,9 +370,11 @@ def url(self) -> Optional[str]: handle = self.handle() if handle is None: return None - endpoints = backend_utils.get_endpoints(handle.cluster_name, self.replica_port) - # TODO(romilb): Fix this type casting mess before merging. - return endpoints.get(int(self.replica_port), None) + try: + endpoint = backend_utils.get_endpoints(handle.cluster_name, self.replica_port) + return endpoint + except RuntimeError: + return None @property def status(self) -> serve_state.ReplicaStatus: @@ -390,6 +392,7 @@ def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: 'name': self.cluster_name, 'status': self.status, 'version': self.version, + 'endpoint': self.url, 'launched_at': (cluster_record['launched_at'] if cluster_record is not None else None), } @@ -415,7 +418,15 @@ def probe( try: msg = '' # TODO(tian): Support HTTPS in the future. 
- readiness_path = (f'http://{self.url}{readiness_path}') + url = self.url + if url is None: + logger.info(f'Error when probing {replica_identity}: ' + 'Cannot get the endpoint.') + return self, False, probe_time + elif not url.startswith('http://'): + url = f'http://{url}' + readiness_path = (f'{url}{readiness_path}') + logger.info(f'Probing {replica_identity} with {readiness_path}.') if post_data is not None: msg += 'POST' response = requests.post( @@ -523,10 +534,16 @@ def get_ready_replica_urls(self) -> List[str]: ready_replica_urls = [] version2url = collections.defaultdict(list) for info in serve_state.get_replica_infos(self._service_name): + url = info.url + if not url: + # If URL is None, IP may not be ready. Skip for now. + continue + if not url.startswith('http://'): + url = f'http://{url}' if info.status == serve_state.ReplicaStatus.READY: assert info.url is not None - version2url[info.version].append(info.url) - ready_replica_urls.append(info.url) + version2url[info.version].append(url) + ready_replica_urls.append(url) # Try all version in descending order. There is possibility that # user consecutively update the service several times, and some # version might not have any ready replicas. diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index bf2ed895eb7..011e1a98824 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -744,7 +744,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], return 'No existing replicas.' 
replica_columns = [ - 'SERVICE_NAME', 'ID', 'VERSION', 'IP', 'LAUNCHED', 'RESOURCES', + 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'RESOURCES', 'STATUS', 'REGION' ] if show_all: @@ -758,10 +758,11 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], replica_records = replica_records[:_REPLICA_TRUNC_NUM] for record in replica_records: + endpoint = record.get('endpoint', None) service_name = record['service_name'] replica_id = record['replica_id'] version = (record['version'] if 'version' in record else '-') - replica_ip = '-' + replica_ip = endpoint if endpoint else '-' launched_at = log_utils.readable_time_duration(record['launched_at']) resources_str = '-' replica_status = record['status'] @@ -771,8 +772,6 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], replica_handle: 'backends.CloudVmRayResourceHandle' = record['handle'] if replica_handle is not None: - if replica_handle.head_ip is not None: - replica_ip = replica_handle.head_ip resources_str = resources_utils.get_readable_resources_repr( replica_handle, simplify=not show_all) if replica_handle.launched_resources.region is not None: From 1848b3a56b7c2327db585bb9279a93a4e27caf6c Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 6 Feb 2024 16:44:25 -0800 Subject: [PATCH 08/85] lint --- sky/adaptors/kubernetes.py | 2 +- sky/backends/backend_utils.py | 40 +++++++++++++++++---------------- sky/cli.py | 5 +++-- sky/clouds/kubernetes.py | 15 +++++-------- sky/core.py | 6 ----- sky/provision/aws/instance.py | 15 ++++--------- sky/provision/azure/__init__.py | 1 - sky/provision/azure/instance.py | 11 --------- sky/provision/common.py | 16 +++++++------ sky/provision/gcp/instance.py | 7 ++++-- sky/serve/core.py | 12 +++++----- sky/serve/replica_managers.py | 6 +++-- sky/serve/serve_utils.py | 4 +++- sky/utils/controller_utils.py | 7 +++--- 14 files changed, 64 insertions(+), 83 deletions(-) diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index 
d7a856b73a6..131638d15ea 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -64,7 +64,7 @@ def _load_config(): os.environ['KUBERNETES_SERVICE_PORT'] = '443' # TODO(romilb) - THIS IS HACK! FIX THIS BEFORE MERGING. raise kubernetes.config.config_exception.ConfigException - kubernetes.config.load_incluster_config() + # kubernetes.config.load_incluster_config() except kubernetes.config.config_exception.ConfigException: try: kubernetes.config.load_kube_config() diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 3dd2a6fc27e..4421429141f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -47,8 +47,8 @@ from sky.utils import common_utils from sky.utils import controller_utils from sky.utils import env_options -from sky.utils import rich_utils from sky.utils import resources_utils +from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import timeline from sky.utils import ux_utils @@ -109,6 +109,9 @@ # Remote dir that holds our runtime files. _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files' +_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' + 'please retry after a while.') + # Include the fields that will be used for generating tags that distinguishes # the cluster in ray, to avoid the stopped cluster being discarded due to # updates in the yaml template. @@ -2690,13 +2693,16 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str, f'{cluster_name}{colorama.Style.RESET_ALL}' f'\n--- Details ---\n{stderr.strip()}\n') -def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ - Union[str, Dict[int, str]]: - """ + +def get_endpoints( + cluster: str, + endpoint: Optional[Union[int, + str]] = None) -> Union[str, Dict[int, str]]: + """Gets the endpoint for a given cluster and port number (endpoint). Args: - cluster: - endpoint: + cluster: The name of the cluster. + endpoint: The port number to get the endpoint for. 
If None, all Returns: Endpoint URL if endpoint is not None, else a dictionary of all @@ -2705,10 +2711,6 @@ def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ RuntimeError: if the cluster has no ports to be exposed or no endpoints are exposed yet. """ - - _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' - 'please retry after a while.') - # Cast endpoint to int if it is not None if endpoint is not None: try: @@ -2735,21 +2737,22 @@ def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ cloud = launched_resources.cloud try: cloud.check_features_are_supported( - launched_resources, - {clouds.CloudImplementationFeatures.OPEN_PORTS}) + launched_resources, {clouds.CloudImplementationFeatures.OPEN_PORTS}) except exceptions.NotSupportedError: with ux_utils.print_exception_no_traceback(): raise ValueError('Querying endpoints is not supported ' f'for {cloud}.') from None config = common_utils.read_yaml(handle.cluster_yaml) - port_details = provision_lib.query_ports( - repr(cloud), handle.cluster_name_on_cloud, - handle.launched_resources.ports, config['provider']) + port_details = provision_lib.query_ports(repr(cloud), + handle.cluster_name_on_cloud, + handle.launched_resources.ports, + config['provider']) if endpoint is not None: # If cluster had no ports to be exposed - port_set = resources_utils.port_ranges_to_set(handle.launched_resources.ports) + port_set = resources_utils.port_ranges_to_set( + handle.launched_resources.ports) if endpoint not in port_set: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Port {endpoint} is not exposed ' @@ -2762,8 +2765,7 @@ def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ if handle.launched_resources.cloud.is_same_cloud( clouds.Kubernetes()): # Add Kubernetes specific debugging info - error_msg += ( - kubernetes_utils.get_endpoint_debug_message()) + error_msg += (kubernetes_utils.get_endpoint_debug_message()) with ux_utils.print_exception_no_traceback(): raise 
RuntimeError(error_msg) return port_details[endpoint][0].url() @@ -2785,4 +2787,4 @@ def get_endpoints(cluster: str, endpoint: Optional[int]) -> \ kubernetes_utils.get_endpoint_debug_message() with ux_utils.print_exception_no_traceback(): raise RuntimeError(error_msg) - return {port: urls[0].url() for port, urls in port_details.items()} \ No newline at end of file + return {port: urls[0].url() for port, urls in port_details.items()} diff --git a/sky/cli.py b/sky/cli.py index fff0cd262c8..9928dd31c0d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1991,12 +1991,13 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, head_ip = handle.external_ips()[0] if show_endpoints: if endpoint: - cluster_endpoint = backend_utils.get_endpoints(cluster_record['name'], - endpoint) + cluster_endpoint = backend_utils.get_endpoints( + cluster_record['name'], endpoint) click.echo(cluster_endpoint) else: cluster_endpoints = backend_utils.get_endpoints( cluster_record['name']) + assert isinstance(cluster_endpoints, dict) for port, port_endpoint in cluster_endpoints.items(): click.echo( f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 295ac6d12da..2c346dbd6fc 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -341,17 +341,12 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: f'check if {CREDENTIAL_PATH} exists.') def get_credential_file_mounts(self) -> Dict[str, str]: - # Okay, I need to - # 1) Generate a SA with the right permissions - # 2) Generate a kubeconfig file with the SA token - # 3) Mount the kubeconfig file into the container - # 4) Additionally, check in the container during auth if the API server IP is reachable. If not, resolve it using the service DNS and use that as the IP. - # 5) Use the same sky-key for all containers in the pod. return { - # TODO(romilb): This is a hack! Fix this before merging. 
- '~/.ssh/sky-key': '~/.ssh/sky-key', - '~/.ssh/sky-key.pub': '~/.ssh/sky-key.pub', - CREDENTIAL_PATH: CREDENTIAL_PATH} + # TODO(romilb): Fix before merging. + '~/.ssh/sky-key': '~/.ssh/sky-key', + '~/.ssh/sky-key.pub': '~/.ssh/sky-key.pub', + CREDENTIAL_PATH: CREDENTIAL_PATH + } def instance_type_exists(self, instance_type: str) -> bool: return kubernetes_utils.KubernetesInstanceType.is_valid_instance_type( diff --git a/sky/core.py b/sky/core.py index e0c392a5cca..d7174e1522e 100644 --- a/sky/core.py +++ b/sky/core.py @@ -12,21 +12,15 @@ from sky import data from sky import exceptions from sky import global_user_state -from sky import provision as provision_lib from sky import sky_logging from sky import spot from sky import status_lib from sky import task from sky.backends import backend_utils -from sky.provision import common as provision_common -# TODO(romilb): This is a bad import - refactor to avoid this. -from sky.provision.kubernetes import utils as kubernetes_utils from sky.skylet import constants from sky.skylet import job_lib from sky.usage import usage_lib -from sky.utils import common_utils from sky.utils import controller_utils -from sky.utils import resources_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import ux_utils diff --git a/sky/provision/aws/instance.py b/sky/provision/aws/instance.py index af4ecc03cca..2d711c632bc 100644 --- a/sky/provision/aws/instance.py +++ b/sky/provision/aws/instance.py @@ -817,15 +817,8 @@ def query_ports( provider_config: Optional[Dict[str, Any]] = None, ) -> Dict[int, List[common.Endpoint]]: """See sky/provision/__init__.py""" + assert provider_config is not None cluster_info = get_cluster_info(provider_config['region'], - cluster_name_on_cloud, - provider_config=provider_config) - head_instance = cluster_info.instances.get(cluster_info.head_instance_id) - if head_instance is None: - return {} - head_ip = head_instance[0].external_ip - ports = 
list(resources_utils.port_ranges_to_set(ports)) - result: Dict[int, List[common.Endpoint]] = {} - for port in ports: - result[port] = [common.SocketEndpoint(port=port, host=head_ip)] - return result \ No newline at end of file + cluster_name_on_cloud, + provider_config=provider_config) + return common.query_ports_passthrough(ports, cluster_info) diff --git a/sky/provision/azure/__init__.py b/sky/provision/azure/__init__.py index 9c87fc907db..b83dbb462d9 100644 --- a/sky/provision/azure/__init__.py +++ b/sky/provision/azure/__init__.py @@ -2,4 +2,3 @@ from sky.provision.azure.instance import cleanup_ports from sky.provision.azure.instance import open_ports -from sky.provision.azure.instance import query_ports diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index 9bf8196fd34..c316e87f55e 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -3,7 +3,6 @@ from sky import sky_logging from sky.adaptors import azure -from sky.provision import common from sky.utils import ux_utils logger = sky_logging.init_logger(__name__) @@ -88,13 +87,3 @@ def cleanup_ports( # Azure will automatically cleanup network security groups when cleanup # resource group. So we don't need to do anything here. del cluster_name_on_cloud, ports, provider_config # Unused. 
- - -def query_ports( - cluster_name_on_cloud: str, - ports: List[str], - provider_config: Optional[Dict[str, Any]] = None, -) -> Dict[int, List[common.Endpoint]]: - """See sky/provision/__init__.py""" - return common.query_ports_passthrough(cluster_name_on_cloud, ports, - provider_config) diff --git a/sky/provision/common.py b/sky/provision/common.py index 5c8afde769b..32c5c3abbfb 100644 --- a/sky/provision/common.py +++ b/sky/provision/common.py @@ -4,7 +4,7 @@ import os from typing import Any, Dict, List, Optional, Tuple -from sky.utils.resources_utils import port_ranges_to_set +from sky.utils import resources_utils # NOTE: we can use pydantic instead of dataclasses or namedtuples, because # pydantic provides more features like validation or parsing from @@ -237,17 +237,19 @@ def url(self, override_ip: Optional[str] = None) -> str: def query_ports_passthrough( - cluster_name_on_cloud: str, ports: List[str], - provider_config: Optional[Dict[str, Any]] = None, + cluster_info: ClusterInfo, ) -> Dict[int, List[Endpoint]]: """Common function to query ports for AWS, GCP and Azure. Returns a list of socket endpoint with empty host and the input ports.""" - del cluster_name_on_cloud, provider_config # Unused. 
- handle = cluster_record['handle'] - head_ip = handle.external_ips()[0] - ports = list(port_ranges_to_set(ports)) + assert cluster_info.head_instance_id is not None, cluster_info + head_instance = cluster_info.instances.get(cluster_info.head_instance_id) + if head_instance is None: + return {} + head_ip = head_instance[0].external_ip + assert head_ip is not None, head_instance + ports = list(resources_utils.port_ranges_to_set(ports)) result: Dict[int, List[Endpoint]] = {} for port in ports: result[port] = [SocketEndpoint(port=port, host=head_ip)] diff --git a/sky/provision/gcp/instance.py b/sky/provision/gcp/instance.py index e7f69f8c6eb..20ef0b6c78d 100644 --- a/sky/provision/gcp/instance.py +++ b/sky/provision/gcp/instance.py @@ -623,5 +623,8 @@ def query_ports( provider_config: Optional[Dict[str, Any]] = None, ) -> Dict[int, List[common.Endpoint]]: """See sky/provision/__init__.py""" - return common.query_ports_passthrough(cluster_name_on_cloud, ports, - provider_config) + assert provider_config is not None + cluster_info = get_cluster_info(provider_config['region'], + cluster_name_on_cloud, + provider_config=provider_config) + return common.query_ports_passthrough(ports, cluster_info) diff --git a/sky/serve/core.py b/sky/serve/core.py index 6a835ea35f0..40c098c6843 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -1,13 +1,13 @@ """SkyServe core APIs.""" import re import tempfile -import typing from typing import Any, Dict, List, Optional, Union import colorama import sky from sky import backends +from sky import clouds from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -25,9 +25,6 @@ from sky.utils import subprocess_utils from sky.utils import ux_utils -if typing.TYPE_CHECKING: - from sky import clouds - @usage_lib.entrypoint def up( @@ -164,8 +161,8 @@ def up( # with the current job id, we know the service is up and running # for the first time; otherwise it is a name conflict. 
idle_minutes_to_autostop = None if ( - controller_cloud and - controller_cloud.is_same_cloud(clouds.Kubernetes()) + controller_cloud and + controller_cloud.is_same_cloud(clouds.Kubernetes()) ) else constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP controller_job_id, controller_handle = execution.execute( entrypoint=controller_task, @@ -232,7 +229,8 @@ def up( else: lb_port = serve_utils.load_service_initialization_result( lb_port_payload) - endpoint = backend_utils.get_endpoints(controller_handle.cluster_name, lb_port) + endpoint = backend_utils.get_endpoints( + controller_handle.cluster_name, lb_port) sky_logging.print( f'{fore.CYAN}Service name: ' diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 04687efd1e4..105a04aecc8 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -16,7 +16,7 @@ import requests import sky -from sky import backends, core +from sky import backends from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -371,7 +371,9 @@ def url(self) -> Optional[str]: if handle is None: return None try: - endpoint = backend_utils.get_endpoints(handle.cluster_name, self.replica_port) + endpoint = backend_utils.get_endpoints(handle.cluster_name, + self.replica_port) + assert isinstance(endpoint, str) return endpoint except RuntimeError: return None diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 011e1a98824..65667baf9ff 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -679,7 +679,9 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: if load_balancer_port is None: return '-' try: - endpoint = backend_utils.get_endpoints(handle.cluster_name, load_balancer_port) + endpoint = backend_utils.get_endpoints(handle.cluster_name, + load_balancer_port) + assert isinstance(endpoint, str) except RuntimeError: return '-' return endpoint diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 
27bc9e4e48a..8c2470eb461 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -168,9 +168,10 @@ def _get_cloud_dependencies_installation_commands( # Install k8s + skypilot dependencies 'sudo bash -c "apt update && apt install curl socat netcat -y" && ' # Install kubectl - 'curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' - 'sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && ' - ) + 'curl -LO "https://dl.k8s.io/release/$(curl -L -s ' + 'https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' + 'sudo install -o root -g root -m 0755 ' + 'kubectl /usr/local/bin/kubectl && ') return commands From ba0ff60a81e812668e3b97cede97d39de7c89cca Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 14 Feb 2024 17:32:14 -0800 Subject: [PATCH 09/85] add purging from #3094 --- sky/cli.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 9928dd31c0d..d2ad03a0762 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3076,9 +3076,19 @@ def _down_or_stop_clusters( controller_name) assert controller is not None hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] - hint_or_raise(controller_name) + try: + hint_or_raise(controller_name) + except exceptions.ClusterOwnerIdentityMismatchError as e: + if purge: + click.echo(common_utils.format_exception(e)) + else: + raise confirm_str = 'delete' + input_prefix = ('Since --purge is set, errors will be ignored ' + 'and controller will be removed from ' + 'local state.\n') if purge else '' user_input = click.prompt( + f'{input_prefix}' f'To proceed, please type {colorama.Style.BRIGHT}' f'{confirm_str!r}{colorama.Style.RESET_ALL}', type=str) From 13f7241914ae36c9252b7e077fbda072d8f4e40b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 14 Feb 2024 22:35:10 -0800 Subject: [PATCH 10/85] Use local catalog on the controller too --- 
sky/templates/sky-serve-controller.yaml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index d49412fb9cd..179cd2a7efb 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -17,6 +17,7 @@ setup: | file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} {{remote_user_config_path}}: skypilot:local_skypilot_config_path + ~/.sky/catalogs: ~/.sky/catalogs run: | # Start sky serve service. From 996742aba02ad19f13598fda6bd1eabf00cb07aa Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 19 Feb 2024 12:58:23 -0800 Subject: [PATCH 11/85] use externalip if available --- sky/provision/kubernetes/network_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index c3fe0d24c9a..b02b729d7a1 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -182,11 +182,17 @@ def get_ingress_external_ip_and_ports( ingress_service = ingress_services[0] if ingress_service.status.load_balancer.ingress is None: + # Try to use assigned external IP if it exists, + # otherwise return 'localhost' + if ingress_service.spec.external_i_ps is not None: + ip = ingress_service.spec.external_i_ps[0] + else: + ip = 'localhost' ports = ingress_service.spec.ports http_port = [port for port in ports if port.name == 'http'][0].node_port https_port = [port for port in ports if port.name == 'https' ][0].node_port - return 'localhost', (int(http_port), int(https_port)) + return ip, (int(http_port), int(https_port)) external_ip = ingress_service.status.load_balancer.ingress[ 0].ip or ingress_service.status.load_balancer.ingress[0].hostname From d852c614238e95a251cd7016cf4f336a4bd94b00 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 23 Feb 2024 14:02:43 -0800 Subject: [PATCH 12/85] add 
dshm_size_limit --- docs/source/reference/config.rst | 7 +++++++ sky/clouds/kubernetes.py | 4 ++++ sky/templates/kubernetes-ray.yml.j2 | 3 +++ sky/utils/schemas.py | 3 +++ 4 files changed, 17 insertions(+) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 5d48870c6ab..62e441fd8f3 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -287,6 +287,13 @@ Available fields and semantics: path: /tmp type: Directory + # Size of the /dev/shm shared memory for the pod (optional). + # + # Defaults to None, which means no size limits are set. If set, the value + # must be a string that is a valid Kubernetes quantity, e.g., "3Gi". + # https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/quantity/ + dshm_size_limit: 3Gi + # Advanced OCI configurations (optional). oci: # A dict mapping region names to region-specific configurations, or diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index a1cbb2a5c09..b45c41a3d23 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -6,6 +6,7 @@ from sky import clouds from sky import sky_logging +from sky import skypilot_config from sky.adaptors import kubernetes from sky.clouds import service_catalog from sky.provision.kubernetes import network_utils @@ -250,6 +251,8 @@ def make_deploy_resources_variables( port_mode = network_utils.get_port_mode(None) + dshm_size_limit = skypilot_config.get_nested(('kubernetes', 'dshm_size_limit'), None) + deploy_vars = { 'instance_type': resources.instance_type, 'custom_resources': custom_resources, @@ -266,6 +269,7 @@ def make_deploy_resources_variables( 'k8s_acc_label_value': k8s_acc_label_value, 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME, 'k8s_ssh_jump_image': ssh_jump_image, + 'k8s_dshm_size_limit': dshm_size_limit, # TODO(romilb): Allow user to specify custom images 'image_id': image_id, } diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 
28d03230645..e5d1da030fd 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -162,6 +162,9 @@ available_node_types: - name: dshm emptyDir: medium: Memory + {% if k8s_dshm_size_limit is not none %} + sizeLimit: {{k8s_dshm_size_limit}} + {% endif %} - name: dev-fuse # Required for fuse mounting hostPath: path: /dev/fuse diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 1d91e77c4d2..8b29d24381d 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -603,6 +603,9 @@ def get_config_schema(): 'required': [], # Allow arbitrary keys since validating pod spec is hard 'additionalProperties': True, + }, + 'dshm_size_limit': { + 'type': 'string' } } }, From b9014df8fd160fd058d6b462482cb846da38b24a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 23 Feb 2024 15:26:19 -0800 Subject: [PATCH 13/85] optimize dependency installation --- sky/utils/controller_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 5237f221f19..cc87bbe97a2 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -166,12 +166,14 @@ def _get_cloud_dependencies_installation_commands( for cloud in global_user_state.get_enabled_clouds()): commands.append( # Install k8s + skypilot dependencies - 'sudo bash -c "apt update && apt install curl socat netcat -y" && ' + 'sudo bash -c "if ' + '! command -v curl &> /dev/null || ' + '! command -v socat &> /dev/null || ' + '! 
command -v netcat &> /dev/null; ' + 'then apt update && apt install curl socat netcat -y; ' + 'fi" && ' # Install kubectl - 'curl -LO "https://dl.k8s.io/release/$(curl -L -s ' - 'https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' - 'sudo install -o root -g root -m 0755 ' - 'kubectl /usr/local/bin/kubectl && ') + '(command -v kubectl &>/dev/null || (curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl)) && ') return commands From 282bcef546d07451802b886a144a21f0cb0a5001 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 23 Feb 2024 16:23:07 -0800 Subject: [PATCH 14/85] Add todo --- sky/serve/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/serve/core.py b/sky/serve/core.py index 003395936dd..67992215ab3 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -156,6 +156,9 @@ def up( # whether the service is already running. If the id is the same # with the current job id, we know the service is up and running # for the first time; otherwise it is a name conflict. + # TODO(romilb): THIS NEEDS TO BE FIXED. If the user doesn't specify a cloud + # for the controller in their ~/.sky/config.yaml, this idle_minutes_to_autostop + # will be None and provisioning would fail on k8s with unsupported feature error. 
idle_minutes_to_autostop = None if ( controller_cloud and controller_cloud.is_same_cloud(clouds.Kubernetes()) From 3cb948fc1b8b506dff9f14b94fa992d305896b0a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 1 Mar 2024 23:17:08 -0800 Subject: [PATCH 15/85] optimize ingress --- sky/provision/kubernetes/network.py | 59 +++++++++++++---------- sky/provision/kubernetes/network_utils.py | 23 ++++++--- sky/templates/kubernetes-ingress.yml.j2 | 39 ++++++++------- 3 files changed, 72 insertions(+), 49 deletions(-) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index dcc93145f71..4ab9adad185 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -57,6 +57,7 @@ def _open_ports_using_ingress( ports: List[int], provider_config: Dict[str, Any], ) -> None: + # Check if an ingress controller exists if not network_utils.ingress_controller_exists(): raise Exception( 'Ingress controller not found. ' @@ -64,32 +65,37 @@ def _open_ports_using_ingress( 'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.' 
# pylint: disable=line-too-long ) - for port in ports: - service_name = f'{cluster_name_on_cloud}-skypilot-service--{port}' - ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress--{port}' - path_prefix = _PATH_PREFIX.format( - cluster_name_on_cloud=cluster_name_on_cloud, port=port) + # Prepare service names, ports, for template rendering + service_details = [ + (f'{cluster_name_on_cloud}-skypilot-service--{port}', port, + _PATH_PREFIX.format(cluster_name_on_cloud=cluster_name_on_cloud, + port=port)) for port in ports + ] - content = network_utils.fill_ingress_template( - namespace=provider_config.get('namespace', 'default'), - path_prefix=path_prefix, - service_name=service_name, - service_port=port, - ingress_name=ingress_name, - selector_key='skypilot-cluster', - selector_value=cluster_name_on_cloud, - ) + # Generate ingress and services specs + content = network_utils.fill_ingress_template( + namespace=provider_config.get('namespace', 'default'), + service_details=service_details, + ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress', + selector_key='skypilot-cluster', + selector_value=cluster_name_on_cloud, + ) + + # Create or update services based on the generated specs + for service_name, service_spec in content['services_spec'].items(): network_utils.create_or_replace_namespaced_service( namespace=provider_config.get('namespace', 'default'), service_name=service_name, - service_spec=content['service_spec'], - ) - network_utils.create_or_replace_namespaced_ingress( - namespace=provider_config.get('namespace', 'default'), - ingress_name=ingress_name, - ingress_spec=content['ingress_spec'], + service_spec=service_spec, ) + # Create or update the single ingress for all services + network_utils.create_or_replace_namespaced_ingress( + namespace=provider_config.get('namespace', 'default'), + ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress', + ingress_spec=content['ingress_spec'], + ) + def cleanup_ports( cluster_name_on_cloud: str, @@ -128,17 
+134,20 @@ def _cleanup_ports_for_ingress( ports: List[int], provider_config: Dict[str, Any], ) -> None: + # Delete services for each port for port in ports: service_name = f'{cluster_name_on_cloud}-skypilot-service--{port}' - ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress--{port}' network_utils.delete_namespaced_service( namespace=provider_config.get('namespace', 'default'), service_name=service_name, ) - network_utils.delete_namespaced_ingress( - namespace=provider_config.get('namespace', 'default'), - ingress_name=ingress_name, - ) + + # Delete the single ingress used for all ports + ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress' + network_utils.delete_namespaced_ingress( + namespace=provider_config.get('namespace', 'default'), + ingress_name=ingress_name, + ) def query_ports( diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index db90d7f7f58..eb2475ee935 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -57,9 +57,10 @@ def fill_loadbalancer_template(namespace: str, service_name: str, return content -def fill_ingress_template(namespace: str, path_prefix: str, service_name: str, - service_port: int, ingress_name: str, - selector_key: str, selector_value: str) -> Dict: +def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int, + str]], + ingress_name: str, selector_key: str, + selector_value: str) -> Dict: template_path = os.path.join(sky.__root_dir__, 'templates', _INGRESS_TEMPLATE_NAME) if not os.path.exists(template_path): @@ -70,15 +71,23 @@ def fill_ingress_template(namespace: str, path_prefix: str, service_name: str, j2_template = jinja2.Template(template) cont = j2_template.render( namespace=namespace, - path_prefix=path_prefix.rstrip('/').lstrip('/'), - service_name=service_name, - service_port=service_port, + service_names_and_ports=[{ + 'service_name': name, + 'service_port': port, + 'path_prefix': 
path_prefix + } for name, port, path_prefix in service_details], ingress_name=ingress_name, selector_key=selector_key, selector_value=selector_value, ) content = yaml.safe_load(cont) - return content + + # Separate the ingress_spec and services_spec from the content + ingress_spec = content['ingress_spec'] + services_spec = {k: v for k, v in content.items() if k != 'ingress_spec'} + + # Return a dictionary containing both specs + return {'ingress_spec': ingress_spec, 'services_spec': services_spec} def create_or_replace_namespaced_ingress( diff --git a/sky/templates/kubernetes-ingress.yml.j2 b/sky/templates/kubernetes-ingress.yml.j2 index 1a1dcbcafc5..84b3e9d6998 100644 --- a/sky/templates/kubernetes-ingress.yml.j2 +++ b/sky/templates/kubernetes-ingress.yml.j2 @@ -12,24 +12,29 @@ ingress_spec: rules: - http: paths: - - path: /{{ path_prefix }}(/|$)(.*) + {% for service in service_names_and_ports %} + - path: /{{ service.path_prefix }}(/|$)(.*) pathType: ImplementationSpecific backend: service: - name: {{ service_name }} + name: {{ service.service_name }} port: - number: {{ service_port }} -service_spec: - apiVersion: v1 - kind: Service - metadata: - name: {{ service_name }} - labels: - parent: skypilot - spec: - type: ClusterIP - selector: - {{ selector_key }}: {{ selector_value }} - ports: - - port: {{ service_port }} - targetPort: {{ service_port }} + number: {{ service.service_port }} + {% endfor %} +services_spec: + {% for service in service_names_and_ports %} + {{ service.service_name }}: + apiVersion: v1 + kind: Service + metadata: + name: {{ service.service_name }} + labels: + parent: skypilot + spec: + type: ClusterIP + selector: + {{ selector_key }}: {{ selector_value }} + ports: + - port: {{ service.service_port }} + targetPort: {{ service.service_port }} + {% endfor %} From 4a7e10abdb1c64369616d9a22b5e4c8c75ada7e0 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 1 Mar 2024 23:39:40 -0800 Subject: [PATCH 16/85] fix --- 
sky/provision/kubernetes/network_utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index eb2475ee935..1613c5f4376 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -82,12 +82,8 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int, ) content = yaml.safe_load(cont) - # Separate the ingress_spec and services_spec from the content - ingress_spec = content['ingress_spec'] - services_spec = {k: v for k, v in content.items() if k != 'ingress_spec'} - # Return a dictionary containing both specs - return {'ingress_spec': ingress_spec, 'services_spec': services_spec} + return {'ingress_spec': content['ingress_spec'], 'services_spec': content['services_spec']} def create_or_replace_namespaced_ingress( From 9e0d910f8a0bd1a9a3e3c7c54afffec78e39c844 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 1 Mar 2024 23:47:20 -0800 Subject: [PATCH 17/85] fix --- sky/provision/kubernetes/network.py | 3 ++- sky/provision/kubernetes/network_utils.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index 4ab9adad185..8bc628e6d8d 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -69,7 +69,8 @@ def _open_ports_using_ingress( service_details = [ (f'{cluster_name_on_cloud}-skypilot-service--{port}', port, _PATH_PREFIX.format(cluster_name_on_cloud=cluster_name_on_cloud, - port=port)) for port in ports + port=port).rstrip('/').lstrip('/')) + for port in ports ] # Generate ingress and services specs diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 1613c5f4376..b5440125e5b 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -83,7 +83,10 @@ def 
fill_ingress_template(namespace: str, service_details: List[Tuple[str, int, content = yaml.safe_load(cont) # Return a dictionary containing both specs - return {'ingress_spec': content['ingress_spec'], 'services_spec': content['services_spec']} + return { + 'ingress_spec': content['ingress_spec'], + 'services_spec': content['services_spec'] + } def create_or_replace_namespaced_ingress( From 68bb6e816972830c1fd80f610d061e816d6c4e54 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 4 Mar 2024 13:04:36 -0800 Subject: [PATCH 18/85] remove autostop timing --- sky/serve/core.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/serve/core.py b/sky/serve/core.py index cd2be8ba344..7c33a24e4df 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -195,10 +195,11 @@ def up( # TODO(romilb): THIS NEEDS TO BE FIXED. If the user doesn't specify a cloud # for the controller in their ~/.sky/config.yaml, this idle_minutes_to_autostop # will be None and provisioning would fail on k8s with unsupported feature error. 
- idle_minutes_to_autostop = None if ( - controller_cloud and - controller_cloud.is_same_cloud(clouds.Kubernetes()) - ) else constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP + idle_minutes_to_autostop = None + # if ( + # controller_cloud and + # controller_cloud.is_same_cloud(clouds.Kubernetes()) + # ) else constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP controller_job_id, controller_handle = sky.launch( task=controller_task, stream_logs=False, From b72a814512667149a8d4c458db65c2759fee7f17 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 4 Mar 2024 16:32:29 -0800 Subject: [PATCH 19/85] Fix URLs for raw IP:ports --- sky/serve/load_balancer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index a96fc648470..3f2d76c29d1 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -87,6 +87,10 @@ async def _redirect_handler(self, request: fastapi.Request): 'Use "sky serve status [SERVICE_NAME]" ' 'to check the replica status.') + # If replica doesn't start with http or https, add http:// + if not ready_replica_url.startswith('http'): + ready_replica_url = 'http://' + ready_replica_url + path = f'{ready_replica_url}{request.url.path}' logger.info(f'Redirecting request to {path}') return fastapi.responses.RedirectResponse(url=path) From a22b6d3a8ab24273a6f50f5c0baac993c2145021 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 7 Mar 2024 11:23:00 -0800 Subject: [PATCH 20/85] fixes --- sky/adaptors/kubernetes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index f5ab9eae98e..a57faa73655 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -63,9 +63,8 @@ def _load_config(): # See issue: https://github.com/skypilot-org/skypilot/issues/2287 os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc' os.environ['KUBERNETES_SERVICE_PORT'] = '443' - # TODO(romilb) - THIS IS HACK! 
FIX THIS BEFORE MERGING. raise kubernetes.config.config_exception.ConfigException - # kubernetes.config.load_incluster_config() + kubernetes.config.load_incluster_config() except kubernetes.config.config_exception.ConfigException: try: kubernetes.config.load_kube_config() From 4637f914c984146beb840f92bb64b016e0759c4d Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 7 Mar 2024 12:12:36 -0800 Subject: [PATCH 21/85] wip --- sky/adaptors/kubernetes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index a57faa73655..3b15995caab 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -63,7 +63,6 @@ def _load_config(): # See issue: https://github.com/skypilot-org/skypilot/issues/2287 os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc' os.environ['KUBERNETES_SERVICE_PORT'] = '443' - raise kubernetes.config.config_exception.ConfigException kubernetes.config.load_incluster_config() except kubernetes.config.config_exception.ConfigException: try: From d27ae5f40b06c69d6c96fb362925f5e0501004fa Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 7 Mar 2024 14:44:46 -0800 Subject: [PATCH 22/85] SA wip --- sky/provision/kubernetes/config.py | 48 +++++++++++++++++++++++++++-- sky/templates/kubernetes-ray.yml.j2 | 19 ++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/sky/provision/kubernetes/config.py b/sky/provision/kubernetes/config.py index 3649f123658..0c049072402 100644 --- a/sky/provision/kubernetes/config.py +++ b/sky/provision/kubernetes/config.py @@ -24,12 +24,18 @@ def bootstrap_instances( config = _configure_ssh_jump(namespace, config) - if not config.provider_config.get('_operator'): - # These steps are unecessary when using the Operator. 
+ requested_service_account = config.node_config['metadata']['spec']['serviceAccountName'] + if requested_service_account == 'skypilot-service-account': + # If the user has requested a different service account, we assume they + # have already set up the necessary roles and role bindings. For + # skypilot-service-account, we set up the roles and role bindings here. _configure_autoscaler_service_account(namespace, config.provider_config) _configure_autoscaler_role(namespace, config.provider_config) _configure_autoscaler_role_binding(namespace, config.provider_config) - + _configure_autoscaler_cluster_role_binding(namespace, config.provider_config) + else: + logger.info(f'Using service account {requested_service_account!r}, ' + 'skipping role and role binding setup.') return config @@ -275,6 +281,42 @@ def _configure_autoscaler_role_binding(namespace: str, logger.info('_configure_autoscaler_role_binding: ' f'{created_msg(binding_field, name)}') +def _configure_autoscaler_cluster_role_binding(namespace: str, + provider_config: Dict[str, Any]) -> None: + binding_field = 'autoscaler_cluster_role_binding' + if binding_field not in provider_config: + logger.info('_configure_autoscaler_cluster_role_binding: ' + f'{not_provided_msg(binding_field)}') + return + + binding = provider_config[binding_field] + if 'namespace' not in binding['metadata']: + binding['metadata']['namespace'] = namespace + elif binding['metadata']['namespace'] != namespace: + raise InvalidNamespaceError(binding_field, namespace) + for subject in binding['subjects']: + if 'namespace' not in subject: + subject['namespace'] = namespace + elif subject['namespace'] != namespace: + subject_name = subject['name'] + raise InvalidNamespaceError( + binding_field + f' subject {subject_name}', namespace) + + name = binding['metadata']['name'] + field_selector = f'metadata.name={name}' + accounts = (kubernetes.auth_api().list_cluster_role_binding( + field_selector=field_selector).items) + if len(accounts) > 0: + 
assert len(accounts) == 1 + logger.info('_configure_autoscaler_cluster_role_binding: ' + f'{using_existing_msg(binding_field, name)}') + return + + logger.info('_configure_autoscaler_cluster_role_binding: ' + f'{not_found_msg(binding_field, name)}') + kubernetes.auth_api().create_cluster_role_binding(binding) + logger.info('_configure_autoscaler_cluster_role_binding: ' + f'{created_msg(binding_field, name)}') def _configure_ssh_jump(namespace, config: common.ProvisionConfig): """Creates a SSH jump pod to connect to the cluster. diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 90eccc878fe..2e41739b5e1 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -80,6 +80,25 @@ provider: name: skypilot-service-account-role apiGroup: rbac.authorization.k8s.io + + # In addition to a role binding, we also need a cluster role binding to give + # the SkyPilot access to the cluster-wide resources such as nodes to get + # node resources. + autoscaler_cluster_role_binding: + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + labels: + parent: skypilot + name: skypilot-service-account-cluster-role-binding + subjects: + - kind: ServiceAccount + name: skypilot-service-account + roleRef: + kind: ClusterRole + name: skypilot-service-account-role + apiGroup: rbac.authorization.k8s.io + services: # Service to expose the head node pod's SSH port. 
- apiVersion: v1 From cdc2e3f27b62e1aea837e96296494c19415160f1 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 8 Mar 2024 00:09:13 -0800 Subject: [PATCH 23/85] Allow use of service accounts through remote_identity field --- .../cloud-setup/cloud-permissions/index.rst | 1 + .../cloud-permissions/kubernetes.rst | 177 ++++++++++++++++++ docs/source/reference/config.rst | 24 +++ sky/adaptors/kubernetes.py | 3 +- sky/backends/backend_utils.py | 7 +- sky/clouds/kubernetes.py | 38 ++-- sky/provision/kubernetes/config.py | 92 +++++++-- sky/provision/kubernetes/instance.py | 2 +- sky/provision/kubernetes/utils.py | 54 +++++- sky/serve/load_balancer.py | 12 ++ sky/templates/kubernetes-ray.yml.j2 | 64 ++++++- sky/utils/schemas.py | 9 +- 12 files changed, 440 insertions(+), 43 deletions(-) create mode 100644 docs/source/cloud-setup/cloud-permissions/kubernetes.rst diff --git a/docs/source/cloud-setup/cloud-permissions/index.rst b/docs/source/cloud-setup/cloud-permissions/index.rst index 873cbf339fc..e2a1aaf16ae 100644 --- a/docs/source/cloud-setup/cloud-permissions/index.rst +++ b/docs/source/cloud-setup/cloud-permissions/index.rst @@ -20,3 +20,4 @@ Table of Contents aws gcp vsphere + kubernetes diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst new file mode 100644 index 00000000000..1049e052f81 --- /dev/null +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -0,0 +1,177 @@ +.. _cloud-permissions-kubernetes: + +Kubernetes +========== + +SkyPilot requires permissions equivalent to the following roles to be able to manage the resources in the Kubernetes cluster: + +.. code-block:: yaml + + # Namespaced role for the service account + # Required for creating pods, services and other necessary resources in the namespace. + # Note these permissions only apply in the namespace where SkyPilot is deployed. 
+ kind: Role + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: sky-sa-role + namespace: default + rules: + - apiGroups: ["*"] + resources: ["*"] + verbs: ["*"] + --- + # ClusterRole for accessing cluster-wide resources. Details for each resource below: + kind: ClusterRole + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: sky-sa-cluster-role + namespace: default + labels: + parent: skypilot + rules: + - apiGroups: [""] + resources: ["nodes"] # Required for getting node resources. + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "clusterrolebindings"] # Required for launching more SkyPilot clusters from within the pod. + verbs: ["get", "list", "watch"] + - apiGroups: ["node.k8s.io"] + resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes. + verbs: ["get", "list", "watch"] + --- + # If using ingresses, role for accessing ingress service IP + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + namespace: ingress-nginx + name: sky-sa-role-ingress-nginx + rules: + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "get"] + + +To create a service account bound with these roles, you can use the following YAML: + +.. code-block:: yaml + + # create-sky-sa.yaml + kind: ServiceAccount + apiVersion: v1 + metadata: + name: sky-sa + namespace: default + labels: + parent: skypilot + --- + # Role for the service account + kind: Role + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: sky-sa-role + namespace: default + labels: + parent: skypilot + rules: + - apiGroups: ["*"] # Required for creating pods, services, secrets and other necessary resources in the namespace. 
+ resources: ["*"] + verbs: ["*"] + --- + # RoleBinding for the service account + kind: RoleBinding + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: sky-sa-rb + namespace: default + labels: + parent: skypilot + subjects: + - kind: ServiceAccount + name: sky-sa + roleRef: + kind: Role + name: sky-sa-role + apiGroup: rbac.authorization.k8s.io + --- + # Role for accessing ingress resources + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + namespace: ingress-nginx + name: sky-sa-role-ingress-nginx + rules: + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "get"] + --- + # RoleBinding for accessing ingress resources + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: sky-sa-rolebinding-ingress-nginx + namespace: ingress-nginx + subjects: + - kind: ServiceAccount + name: sky-sa + namespace: default + roleRef: + kind: Role + name: sky-sa-role-ingress-nginx + apiGroup: rbac.authorization.k8s.io + --- + # ClusterRole for the service account + kind: ClusterRole + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: sky-sa-cluster-role + namespace: default + labels: + parent: skypilot + rules: + - apiGroups: [""] + resources: ["nodes"] # Required for getting node resources. + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "clusterrolebindings"] # Required for launching more SkyPilot clusters from within the pod. + verbs: ["get", "list", "watch"] + - apiGroups: ["node.k8s.io"] + resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes. + verbs: ["get", "list", "watch"] + - apiGroups: ["networking.k8s.io"] # Required for exposing services. 
+ resources: ["ingressclasses"] + verbs: ["get", "list", "watch"] + --- + # ClusterRoleBinding for the service account + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: sky-sa-cluster-role-binding + namespace: default + labels: + parent: skypilot + subjects: + - kind: ServiceAccount + name: sky-sa + namespace: default + roleRef: + kind: ClusterRole + name: sky-sa-cluster-role + apiGroup: rbac.authorization.k8s.io + +.. code-block:: bash + + kubectl apply -f create-sky-sa.yaml + +After creating the service account, you can configure SkyPilot to use it through ``~/.sky/config.yaml``: + +.. code-block:: yaml + + kubernetes: + remote_identity: sky-sa # Or your service account name + +If you would like SkyPilot to automatically create the service account and roles, you can use the following config: + +.. code-block:: yaml + + kubernetes: + remote_identity: SERVICE_ACCOUNT # Will automatically create the service account and roles diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index e775b903dae..c3cb5714845 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -257,6 +257,30 @@ Available fields and semantics: # for details on deploying the NGINX ingress controller. ports: loadbalancer + # Identity to use for all Kubernetes pods (optional). + # + # LOCAL_CREDENTIALS: The user's local ~/.kube/config will be uploaded to the + # Kubernetes pods created by SkyPilot. They are used for authenticating with + # the Kubernetes API server and launching new pods (e.g., for + # spot/serve controllers). + # + # SERVICE_ACCOUNT: Local ~/.kube/config is not uploaded to Kubernetes pods. + # SkyPilot will auto-create and reuse a service account with necessary roles + # in the user's namespace. + # + # <string>: The name of a service account to use for all Kubernetes pods. + # This service account must exist in the user's namespace and have all + # necessary permissions. 
Refer to https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/kubernetes.html + # for details on the roles required by the service account. + # + # Using SERVICE_ACCOUNT or a custom service account only affects Kubernetes + # instances. Local ~/.kube/config will still be uploaded to non-Kubernetes + # instances (e.g., a serve controller on GCP or AWS may need to provision + # Kubernetes resources). + # + # Default: 'LOCAL_CREDENTIALS'. + remote_identity: my-k8s-service-account + # Additional fields to override the pod fields used by SkyPilot (optional) # # Any key:value pairs added here would get added to the pod spec used to diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index 3b15995caab..26f9ac1d121 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -81,7 +81,8 @@ def _load_config(): else: err_str = ( 'Failed to load Kubernetes configuration. ' - f'Please check if your kubeconfig file is valid.{suffix}') + 'Please check if your kubeconfig file exists at ' + f'~/.kube/config and is valid.{suffix}') err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.' 
with ux_utils.print_exception_no_traceback(): raise ValueError(err_str) from None diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 7cc27fea6cc..825fd223b5c 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -51,6 +51,7 @@ from sky.utils import subprocess_utils from sky.utils import timeline from sky.utils import ux_utils +from sky.utils import schemas if typing.TYPE_CHECKING: from sky import resources @@ -800,8 +801,10 @@ def write_cluster_config( assert cluster_name is not None excluded_clouds = [] remote_identity = skypilot_config.get_nested( - (str(cloud).lower(), 'remote_identity'), 'LOCAL_CREDENTIALS') - if remote_identity == 'SERVICE_ACCOUNT': + (str(cloud).lower(), 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + # For Kubernetes, remote_identity can be 'SERVICE_ACCOUNT', + # 'LOCAL_CREDENTIALS' or a string for the service account to use. + if remote_identity != 'LOCAL_CREDENTIALS': if not cloud.supports_service_account_on_remote(): raise exceptions.InvalidCloudConfigs( 'remote_identity: SERVICE_ACCOUNT is specified in ' diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 885de4ab3c1..7b5ba33fb61 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -13,6 +13,8 @@ from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import common_utils from sky.utils import resources_utils +from sky.utils import schemas + if typing.TYPE_CHECKING: # Renaming to avoid shadowing variables. 
@@ -33,6 +35,7 @@ class Kubernetes(clouds.Cloud): SKY_SSH_KEY_SECRET_FIELD_NAME = \ f'ssh-publickey-{common_utils.get_user_hash()}' SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod' + SKY_DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account' PORT_FORWARD_PROXY_CMD_TEMPLATE = \ 'kubernetes-port-forward-proxy-command.sh.j2' PORT_FORWARD_PROXY_CMD_PATH = '~/.sky/port-forward-proxy-cmd.sh' @@ -46,6 +49,8 @@ class Kubernetes(clouds.Cloud): # TODO(romilb): Make the timeout configurable. TIMEOUT = 10 + _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True + _DEFAULT_NUM_VCPUS = 2 _DEFAULT_MEMORY_CPU_RATIO = 1 _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks @@ -259,6 +264,20 @@ def make_deploy_resources_variables( port_mode = network_utils.get_port_mode(None) dshm_size_limit = skypilot_config.get_nested(('kubernetes', 'dshm_size_limit'), None) + remote_identity = skypilot_config.get_nested(('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + if remote_identity == 'LOCAL_CREDENTIALS': + # SA name doesn't matter since automounting credentials will be turned off + k8s_service_account_name = 'default' + k8s_automount_service_account_token = 'false' + elif remote_identity == 'SERVICE_ACCOUNT': + # Use the default service account + k8s_service_account_name = self.SKY_DEFAULT_SERVICE_ACCOUNT_NAME + k8s_automount_service_account_token = 'true' + else: + # User specified a custom service account + k8s_service_account_name = remote_identity + k8s_automount_service_account_token = 'true' + deploy_vars = { 'instance_type': resources.instance_type, @@ -277,7 +296,8 @@ def make_deploy_resources_variables( 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME, 'k8s_ssh_jump_image': ssh_jump_image, 'k8s_dshm_size_limit': dshm_size_limit, - # TODO(romilb): Allow user to specify custom images + 'k8s_service_account_name': k8s_service_account_name, + 'k8s_automount_service_account_token': k8s_automount_service_account_token, 'image_id': image_id, } @@ -347,16 +367,12 @@ def 
_make(instance_list): @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: - if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)): - # Test using python API - try: - return kubernetes_utils.check_credentials() - except Exception as e: # pylint: disable=broad-except - return (False, 'Credential check failed: ' - f'{common_utils.format_exception(e)}') - else: - return (False, 'Credentials not found - ' - f'check if {CREDENTIAL_PATH} exists.') + # Test using python API + try: + return kubernetes_utils.check_credentials() + except Exception as e: # pylint: disable=broad-except + return (False, 'Credential check failed: ' + f'{common_utils.format_exception(e)}') def get_credential_file_mounts(self) -> Dict[str, str]: if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)): diff --git a/sky/provision/kubernetes/config.py b/sky/provision/kubernetes/config.py index 0c049072402..1d35a7d8880 100644 --- a/sky/provision/kubernetes/config.py +++ b/sky/provision/kubernetes/config.py @@ -24,16 +24,24 @@ def bootstrap_instances( config = _configure_ssh_jump(namespace, config) - requested_service_account = config.node_config['metadata']['spec']['serviceAccountName'] + requested_service_account = config.node_config['spec']['serviceAccountName'] if requested_service_account == 'skypilot-service-account': - # If the user has requested a different service account, we assume they - # have already set up the necessary roles and role bindings. For - # skypilot-service-account, we set up the roles and role bindings here. + # If the user has requested a different service account (via pod_config + # in ~/.sky/config.yaml), we assume they have already set up the + # necessary roles and role bindings. + # If not, set up the roles and bindings for skypilot-service-account + # here. 
_configure_autoscaler_service_account(namespace, config.provider_config) - _configure_autoscaler_role(namespace, config.provider_config) - _configure_autoscaler_role_binding(namespace, config.provider_config) + _configure_autoscaler_role(namespace, config.provider_config, role_field='autoscaler_role') + _configure_autoscaler_role_binding(namespace, config.provider_config, binding_field='autoscaler_role_binding') + _configure_autoscaler_cluster_role(namespace, config.provider_config) _configure_autoscaler_cluster_role_binding(namespace, config.provider_config) - else: + if config.provider_config.get('port_mode', 'loadbalancer') == 'ingress': + logger.info('Port mode is set to ingress, setting up ingress role and role binding') + _configure_autoscaler_role(namespace, config.provider_config, role_field='autoscaler_ingress_role') + _configure_autoscaler_role_binding(namespace, config.provider_config, binding_field='autoscaler_ingress_role_binding') + + elif requested_service_account != 'default': logger.info(f'Using service account {requested_service_account!r}, ' 'skipping role and role binding setup.') return config @@ -215,8 +223,16 @@ def _configure_autoscaler_service_account( def _configure_autoscaler_role(namespace: str, - provider_config: Dict[str, Any]) -> None: - role_field = 'autoscaler_role' + provider_config: Dict[str, Any], + role_field: str) -> None: + """ Reads the role from the provider config, creates if it does not exist. + + Args: + namespace: The namespace to create the role in. + provider_config: The provider config. + role_field: The field in the provider config that contains the role. 
+ """ + if role_field not in provider_config: logger.info('_configure_autoscaler_role: ' f'{not_provided_msg(role_field)}') @@ -225,8 +241,8 @@ def _configure_autoscaler_role(namespace: str, role = provider_config[role_field] if 'namespace' not in role['metadata']: role['metadata']['namespace'] = namespace - elif role['metadata']['namespace'] != namespace: - raise InvalidNamespaceError(role_field, namespace) + else: + namespace = role['metadata']['namespace'] name = role['metadata']['name'] field_selector = f'metadata.name={name}' @@ -245,8 +261,16 @@ def _configure_autoscaler_role(namespace: str, def _configure_autoscaler_role_binding(namespace: str, - provider_config: Dict[str, Any]) -> None: - binding_field = 'autoscaler_role_binding' + provider_config: Dict[str, Any], + binding_field: str) -> None: + """ Reads the role binding from the config, creates if it does not exist. + + Args: + namespace: The namespace to create the role binding in. + provider_config: The provider config. + binding_field: The field in the provider config that contains the role + """ + if binding_field not in provider_config: logger.info('_configure_autoscaler_role_binding: ' f'{not_provided_msg(binding_field)}') @@ -255,8 +279,10 @@ def _configure_autoscaler_role_binding(namespace: str, binding = provider_config[binding_field] if 'namespace' not in binding['metadata']: binding['metadata']['namespace'] = namespace - elif binding['metadata']['namespace'] != namespace: - raise InvalidNamespaceError(binding_field, namespace) + rb_namespace = namespace + else: + rb_namespace = binding['metadata']['namespace'] + for subject in binding['subjects']: if 'namespace' not in subject: subject['namespace'] = namespace @@ -268,7 +294,7 @@ def _configure_autoscaler_role_binding(namespace: str, name = binding['metadata']['name'] field_selector = f'metadata.name={name}' accounts = (kubernetes.auth_api().list_namespaced_role_binding( - namespace, field_selector=field_selector).items) + rb_namespace, 
field_selector=field_selector).items) if len(accounts) > 0: assert len(accounts) == 1 logger.info('_configure_autoscaler_role_binding: ' @@ -277,12 +303,40 @@ def _configure_autoscaler_role_binding(namespace: str, logger.info('_configure_autoscaler_role_binding: ' f'{not_found_msg(binding_field, name)}') - kubernetes.auth_api().create_namespaced_role_binding(namespace, binding) + kubernetes.auth_api().create_namespaced_role_binding(rb_namespace, binding) logger.info('_configure_autoscaler_role_binding: ' f'{created_msg(binding_field, name)}') -def _configure_autoscaler_cluster_role_binding(namespace: str, - provider_config: Dict[str, Any]) -> None: +def _configure_autoscaler_cluster_role(namespace, provider_config: Dict[str, Any]) -> None: + role_field = 'autoscaler_cluster_role' + if role_field not in provider_config: + logger.info('_configure_autoscaler_cluster_role: ' + f'{not_provided_msg(role_field)}') + return + + role = provider_config[role_field] + if 'namespace' not in role['metadata']: + role['metadata']['namespace'] = namespace + elif role['metadata']['namespace'] != namespace: + raise InvalidNamespaceError(role_field, namespace) + + name = role['metadata']['name'] + field_selector = f'metadata.name={name}' + accounts = (kubernetes.auth_api().list_cluster_role( + field_selector=field_selector).items) + if len(accounts) > 0: + assert len(accounts) == 1 + logger.info('_configure_autoscaler_cluster_role: ' + f'{using_existing_msg(role_field, name)}') + return + + logger.info('_configure_autoscaler_cluster_role: ' + f'{not_found_msg(role_field, name)}') + kubernetes.auth_api().create_cluster_role(role) + logger.info(f'_configure_autoscaler_cluster_role: {created_msg(role_field, name)}') + + +def _configure_autoscaler_cluster_role_binding(namespace, provider_config: Dict[str, Any]) -> None: binding_field = 'autoscaler_cluster_role_binding' if binding_field not in provider_config: logger.info('_configure_autoscaler_cluster_role_binding: ' diff --git 
a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index e07f7c31fa2..ac05a1bf246 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -209,7 +209,7 @@ def _wait_for_pods_to_run(namespace, new_nodes): node.metadata.name, namespace) # Continue if pod and all the containers within the - # pod are succesfully created and running. + # pod are successfully created and running. if pod.status.phase == 'Running' and all( container.state.running for container in pod.status.container_statuses): diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 7feea920dba..1e2b3ddcea6 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -18,6 +18,7 @@ from sky.utils import common_utils from sky.utils import env_options from sky.utils import kubernetes_enums +from sky.utils import schemas from sky.utils import ux_utils DEFAULT_NAMESPACE = 'default' @@ -509,21 +510,66 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ except Exception as e: # pylint: disable=broad-except return False, ('An error occurred: ' f'{common_utils.format_exception(e, use_bracket=True)}') - # If we reach here, the credentials are valid and Kubernetes cluster is up + + # If we reach here, the credentials are valid and Kubernetes cluster is up. + # We now do softer checks to check if exec based auth is used and to + # see if the cluster is GPU-enabled. + + # Check if exec based auth is used + exec_msg = '' + k8s = kubernetes.get_kubernetes() + try: + k8s.config.load_kube_config() + except kubernetes.config_exception(): + pass # Using service account token or other auth methods, continue + else: + # Get active context and user from kubeconfig using k8s api + _, current_context = k8s.config.list_kube_config_contexts() + target_username = current_context['context']['user'] + + # K8s api does not provide a mechanism to get the user details from the + # context. 
We need to load the kubeconfig file and parse it to get the + # user details. + kubeconfig_path = os.path.expanduser(os.getenv('KUBECONFIG', + k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION)) + # Load the kubeconfig file as a dictionary + with open(kubeconfig_path, 'r') as f: + kubeconfig = yaml.safe_load(f) + + user_details = kubeconfig['users'] + + # Find user matching the target username + user_details = next(user for user in user_details if user['name'] == target_username) + + remote_identity = skypilot_config.get_nested(('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + if ('exec' in user_details.get('user', {}) and + remote_identity == 'LOCAL_CREDENTIALS'): + ctx_name = current_context['name'] + exec_msg = ('exec-based authentication is used for ' + f'Kubernetes context {ctx_name!r}.' + ' This may cause issues when running Managed Spot ' + 'or SkyServe controller on Kubernetes. To fix, configure SkyPilot to create a service account for running pods by adding ' + 'the following in ~/.sky/config.yaml:\n kubernetes:\n remote_identity: SERVICE_ACCOUNT\n More: https://skypilot.readthedocs.io/en/latest/reference/config.html') + # We now check if GPUs are available and labels are set correctly on the # cluster, and if not we return hints that may help debug any issues. # This early check avoids later surprises for user when they try to run # `sky launch --gpus ` and the optimizer does not list Kubernetes as a # provider if their cluster GPUs are not setup correctly. + gpu_msg = '' try: _, _ = get_gpu_label_key_value(acc_type='', check_mode=True) except exceptions.ResourcesUnavailableError as e: # If GPUs are not available, we return cluster as enabled (since it can # be a CPU-only cluster) but we also return the exception message which # serves as a hint for how to enable GPU access. 
- return True, f'{e}' - return True, None - + gpu_msg = str(e) + if exec_msg and gpu_msg: + return True, f'{gpu_msg}\n Additionally, {exec_msg}' + elif gpu_msg or exec_msg: + return True, gpu_msg or exec_msg + else: + return True, None def get_current_kube_config_context_name() -> Optional[str]: """Get the current kubernetes context from the kubeconfig file diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 3f2d76c29d1..13da18cc9cb 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -118,3 +118,15 @@ def run_load_balancer(controller_addr: str, load_balancer_port: int): load_balancer = SkyServeLoadBalancer(controller_url=controller_addr, load_balancer_port=load_balancer_port) load_balancer.run() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--controller-addr', required=True, + default='127.0.0.1', + help='The address of the controller.') + parser.add_argument('--load-balancer-port', type=int, required=True, + default=8890, + help='The port where the load balancer listens to.') + args = parser.parse_args() + run_load_balancer(args.controller_addr, args.load_balancer_port) \ No newline at end of file diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 2e41739b5e1..faf489cdb26 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -81,9 +81,65 @@ provider: apiGroup: rbac.authorization.k8s.io + # Role to access ingress services for fetching IP + autoscaler_ingress_role: + kind: Role + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + namespace: ingress-nginx + name: skypilot-service-account-ingress-role + labels: + parent: skypilot + rules: + - apiGroups: [ "" ] + resources: [ "services" ] + verbs: [ "list", "get", "watch" ] + - apiGroups: [ "rbac.authorization.k8s.io" ] + resources: [ "roles", "rolebindings" ] + verbs: [ "get", "list", "watch" ] + + # RoleBinding to access 
ingress services for fetching IP + autoscaler_ingress_role_binding: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + namespace: ingress-nginx + name: skypilot-service-account-ingress-role-binding + labels: + parent: skypilot + subjects: + - kind: ServiceAccount + name: skypilot-service-account + roleRef: + kind: Role + name: skypilot-service-account-ingress-role + apiGroup: rbac.authorization.k8s.io + # In addition to a role binding, we also need a cluster role binding to give # the SkyPilot access to the cluster-wide resources such as nodes to get # node resources. + autoscaler_cluster_role: + kind: ClusterRole + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + labels: + parent: skypilot + name: skypilot-service-account-cluster-role + rules: + - apiGroups: [ "" ] + resources: [ "nodes" ] # Required for getting node resources. + verbs: [ "get", "list", "watch" ] + - apiGroups: [ "rbac.authorization.k8s.io" ] + resources: [ "clusterroles", "clusterrolebindings" ] # Required for launching more SkyPilot clusters from within the pod. + verbs: [ "get", "list", "watch" ] + - apiGroups: [ "node.k8s.io" ] + resources: [ "runtimeclasses" ] # Required for autodetecting the runtime class of the nodes. + verbs: [ "get", "list", "watch" ] + - apiGroups: [ "networking.k8s.io" ] # Required for exposing services. + resources: [ "ingressclasses" ] + verbs: [ "get", "list", "watch" ] + + # Bind cluster role to the service account autoscaler_cluster_role_binding: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -96,7 +152,7 @@ provider: name: skypilot-service-account roleRef: kind: ClusterRole - name: skypilot-service-account-role + name: skypilot-service-account-cluster-role apiGroup: rbac.authorization.k8s.io services: @@ -159,9 +215,9 @@ available_node_types: # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod. 
skypilot-ssh-jump: {{k8s_ssh_jump_name}} spec: - # Change this if you altered the autoscaler_service_account above - # or want to provide your own. - serviceAccountName: skypilot-service-account + # serviceAccountName: skypilot-service-account + serviceAccountName: {{k8s_service_account_name}} + automountServiceAccountToken: {{k8s_automount_service_account_token}} restartPolicy: Never diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 34b623e6a7c..fd204b143f6 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -534,6 +534,7 @@ def get_cluster_schema(): } } +REMOTE_IDENTITY_DEFAULT = 'LOCAL_CREDENTIALS' def get_config_schema(): # pylint: disable=import-outside-toplevel @@ -616,6 +617,9 @@ def get_config_schema(): }, 'dshm_size_limit': { 'type': 'string' + }, + 'remote_identity': { + 'type': 'string' } } }, @@ -647,7 +651,10 @@ def get_config_schema(): } for config in cloud_configs.values(): - config['properties'].update(_REMOTE_IDENTITY_SCHEMA) + for key in _REMOTE_IDENTITY_SCHEMA: + if key not in config['properties']: # Add if not already present + config['properties'].update(_REMOTE_IDENTITY_SCHEMA) + break return { '$schema': 'https://json-schema.org/draft/2020-12/schema', 'type': 'object', From 643fb776043b55a7cf4c97b25c21d05965e88b32 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 8 Mar 2024 09:13:53 -0800 Subject: [PATCH 24/85] Make purge work for no clusters in kubeconfig --- sky/backends/cloud_vm_ray_backend.py | 8 ++++++++ sky/cli.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 7015953a40f..23a14a40ad2 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3971,6 +3971,14 @@ def post_teardown_cleanup(self, pass except exceptions.PortDoesNotExistError: logger.debug('Ports do not exist. 
Skipping cleanup.') + except Exception as e: # pylint: disable=broad-except + if purge: + logger.warning( + f'Failed to cleanup ports. Skipping since purge is ' + f'set. Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + else: + raise # The cluster file must exist because the cluster_yaml will only # be removed after the cluster entry in the database is removed. diff --git a/sky/cli.py b/sky/cli.py index 73e799024c4..69fc7d0d531 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3014,7 +3014,7 @@ def _down_or_stop_clusters( hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] try: hint_or_raise(controller_name) - except exceptions.ClusterOwnerIdentityMismatchError as e: + except (exceptions.ClusterOwnerIdentityMismatchError, RuntimeError) as e: if purge: click.echo(common_utils.format_exception(e)) else: From 7649fcb8e9307c0a2cdead91f84ed11956c30288 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 8 Mar 2024 10:43:38 -0800 Subject: [PATCH 25/85] Handle ingress namespace not present --- sky/provision/kubernetes/config.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/config.py b/sky/provision/kubernetes/config.py index 1d35a7d8880..605e164142c 100644 --- a/sky/provision/kubernetes/config.py +++ b/sky/provision/kubernetes/config.py @@ -38,8 +38,16 @@ def bootstrap_instances( _configure_autoscaler_cluster_role_binding(namespace, config.provider_config) if config.provider_config.get('port_mode', 'loadbalancer') == 'ingress': logger.info('Port mode is set to ingress, setting up ingress role and role binding') - _configure_autoscaler_role(namespace, config.provider_config, role_field='autoscaler_ingress_role') - _configure_autoscaler_role_binding(namespace, config.provider_config, binding_field='autoscaler_ingress_role_binding') + try: + _configure_autoscaler_role(namespace, config.provider_config, role_field='autoscaler_ingress_role') + _configure_autoscaler_role_binding(namespace, 
config.provider_config, binding_field='autoscaler_ingress_role_binding') + except kubernetes.api_exception() as e: + # If namespace is not found, we will ignore the error + if e.status == 404: + logger.info(f'Namespace not found - is your nginx ingress installed? Skipping ingress role and role binding setup.') + else: + raise e + elif requested_service_account != 'default': logger.info(f'Using service account {requested_service_account!r}, ' From 8c76eb61b9b394542d8f599e8604200ee2e9ddf2 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 8 Mar 2024 15:10:21 -0800 Subject: [PATCH 26/85] setup optimizations and critical SA key fix --- sky/clouds/kubernetes.py | 3 -- sky/templates/sky-serve-controller.yaml.j2 | 7 ++- sky/utils/controller_utils.py | 57 ++++++++++++++-------- 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 7b5ba33fb61..b8d0df4b86c 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -379,9 +379,6 @@ def get_credential_file_mounts(self) -> Dict[str, str]: # Upload kubeconfig to the default path to avoid having to set # KUBECONFIG in the environment. return { - # TODO(romilb): Fix before merging. - '~/.ssh/sky-key': '~/.ssh/sky-key', - '~/.ssh/sky-key.pub': '~/.ssh/sky-key.pub', DEFAULT_KUBECONFIG_PATH: CREDENTIAL_PATH } else: diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index 179cd2a7efb..17e143da9b4 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -11,13 +11,16 @@ setup: | {%- endfor %} # Install serve dependencies. 
- pip list | grep uvicorn > /dev/null 2>&1 || pip install uvicorn > /dev/null 2>&1 - pip list | grep fastapi > /dev/null 2>&1 || pip install fastapi > /dev/null 2>&1 + pip list | grep uvicorn || pip install uvicorn + pip list | grep fastapi || pip install fastapi file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} {{remote_user_config_path}}: skypilot:local_skypilot_config_path ~/.sky/catalogs: ~/.sky/catalogs + # TODO(romilb): Fix before merging. + ~/.ssh/sky-key: ~/.ssh/sky-key + ~/.ssh/sky-key.pub: ~/.ssh/sky-key.pub run: | # Start sky serve service. diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index cc87bbe97a2..93db5ee4483 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -134,26 +134,30 @@ def from_name(cls, name: Optional[str]) -> Optional['Controllers']: # TODO(zhwu): Keep the dependencies align with the ones in setup.py def _get_cloud_dependencies_installation_commands( controller_type: str) -> List[str]: - commands = [ - # aws - 'pip list | grep boto3 > /dev/null 2>&1 || ' - 'pip install "urllib3<2" awscli>=1.27.10 botocore>=1.29.10 ' - 'boto3>=1.26.1 > /dev/null 2>&1', - # gcp - 'pip list | grep google-api-python-client > /dev/null 2>&1 || ' - 'pip install google-api-python-client>=2.69.0 google-cloud-storage ' - '> /dev/null 2>&1', - f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}', - ] - # k8s and ibm doesn't support open port and spot instance yet, so we don't - # install them for either controller. - if controller_type == 'spot': - # oci doesn't support open port yet, so we don't install oci - # dependencies for sky serve controller. - commands.append('pip list | grep oci > /dev/null 2>&1 || ' - 'pip install oci > /dev/null 2>&1') + commands = [] + enabled_clouds = global_user_state.get_enabled_clouds() # TODO(tian): Make dependency installation command a method of cloud # class and get all installation command for enabled clouds. 
+ # AWS + if any( + cloud.is_same_cloud(clouds.AWS()) + for cloud in enabled_clouds): + commands.append( + 'pip list | grep boto3 > /dev/null 2>&1 || ' + 'pip install "urllib3<2" awscli>=1.27.10 botocore>=1.29.10 ' + 'boto3>=1.26.1 > /dev/null 2>&1' + ) + # GCP + if any( + cloud.is_same_cloud(clouds.GCP()) + for cloud in enabled_clouds): + commands.extend( + ['pip list | grep google-api-python-client > /dev/null 2>&1 || ' + 'pip install google-api-python-client>=2.69.0 google-cloud-storage ' + '> /dev/null 2>&1', + f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}'] + ) + # Azure if any( cloud.is_same_cloud(clouds.Azure()) for cloud in global_user_state.get_enabled_clouds()): @@ -161,6 +165,7 @@ def _get_cloud_dependencies_installation_commands( 'pip list | grep azure-cli > /dev/null 2>&1 || ' 'pip install azure-cli>=2.31.0 azure-core azure-identity>=1.13.0 ' 'azure-mgmt-network > /dev/null 2>&1') + # Kubernetes if any( cloud.is_same_cloud(clouds.Kubernetes()) for cloud in global_user_state.get_enabled_clouds()): @@ -173,7 +178,21 @@ def _get_cloud_dependencies_installation_commands( 'then apt update && apt install curl socat netcat -y; ' 'fi" && ' # Install kubectl - '(command -v kubectl &>/dev/null || (curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl)) && ') + '(command -v kubectl &>/dev/null || (curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl)) && ' + ) + # OCI + if controller_type == 'spot': + # oci doesn't support open port yet, so we don't install oci + # dependencies for sky serve controller. 
+ if any( + cloud.is_same_cloud(clouds.OCI()) + for cloud in enabled_clouds): + commands.append( + 'pip list | grep oci > /dev/null 2>&1 || ' + 'pip install oci > /dev/null 2>&1' + ) + # ibm doesn't support open port and spot instance yet, so we don't + # install them for either controller. return commands From 2e20600b70cfbcbc0f1a8b016825d8257df6a48a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 16 Mar 2024 06:14:52 +0530 Subject: [PATCH 27/85] fix docs --- docs/source/cloud-setup/cloud-permissions/kubernetes.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index 1049e052f81..e8142e5f590 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -50,13 +50,15 @@ SkyPilot requires permissions equivalent to the following roles to be able to ma resources: ["services"] verbs: ["list", "get"] +Example Service Account YAML +---------------------------- To create a service account bound with these roles, you can use the following YAML: .. 
code-block:: yaml # create-sky-sa.yaml - kind: ServiceAccount + kind: ServiceAccount apiVersion: v1 metadata: name: sky-sa From de6a99c419ad5c2d7c0b9dc26f3177af639208e3 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 16 Mar 2024 07:17:26 +0530 Subject: [PATCH 28/85] fix docs --- docs/source/cloud-setup/cloud-permissions/kubernetes.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index e8142e5f590..e2ef3e325bc 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -104,7 +104,10 @@ To create a service account bound with these roles, you can use the following YA rules: - apiGroups: [""] resources: ["services"] - verbs: ["list", "get"] + verbs: ["list", "get", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["list", "get", "watch"] --- # RoleBinding for accessing ingress resources apiVersion: rbac.authorization.k8s.io/v1 From 03e44ebd5f60ed1fdeeb28421b12a06707b4559b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 24 Mar 2024 12:07:29 -0700 Subject: [PATCH 29/85] Add support for skypilot.co/external-ip annotation for ingress --- .../reference/kubernetes/kubernetes-setup.rst | 18 +++++++++++++++++- sky/clouds/kubernetes.py | 18 +++++++++--------- sky/provision/kubernetes/network_utils.py | 12 +++++++++--- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/docs/source/reference/kubernetes/kubernetes-setup.rst b/docs/source/reference/kubernetes/kubernetes-setup.rst index 9e797349f72..c1a4a664906 100644 --- a/docs/source/reference/kubernetes/kubernetes-setup.rst +++ b/docs/source/reference/kubernetes/kubernetes-setup.rst @@ -382,7 +382,7 @@ To use this mode: # ingress-nginx-controller LoadBalancer 10.24.4.254 35.202.58.117 80:31253/TCP,443:32699/TCP .. 
note:: - If the ``EXTERNAL-IP`` field is ````, you must to manually assign a External IP. + If the ``EXTERNAL-IP`` field is ````, you may manually assign it an External IP. This can be done by patching the service with an IP that can be accessed from outside the cluster. If the service type is ``NodePort``, you can set the ``EXTERNAL-IP`` to any node's IP address: @@ -395,6 +395,22 @@ To use this mode: If the ``EXTERNAL-IP`` field is left as ````, SkyPilot will use ``localhost`` as the external IP for the Ingress, and the endpoint may not be accessible from outside the cluster. +.. note:: + If you cannot update the ``EXTERNAL-IP`` field of the service, you can also + specify the Ingress IP or hostname through the ``skypilot.co/external-ip`` + annotation on the ``ingress-nginx-controller`` service. In this case, + having a valid ``EXTERNAL-IP`` field is not required. + + For example, if your ``ingress-nginx-controller`` service is ``NodePort``: + + .. code-block:: bash + + # Add skypilot.co/external-ip annotation to the nginx ingress service. + # Replace in the following command with the IP you select. + # Can be any node's IP if using NodePort service type. + $ kubectl annotate service ingress-nginx-controller skypilot.co/external-ip= -n ingress-nginx + + 3. Update the :ref:`SkyPilot config ` at :code:`~/.sky/config` to use the ingress mode. .. code-block:: yaml diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index b8d0df4b86c..8f58a17d159 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -84,15 +84,15 @@ def _unsupported_features_for_resources( ) -> Dict[clouds.CloudImplementationFeatures, str]: unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES curr_context = kubernetes_utils.get_current_kube_config_context_name() - if curr_context == kubernetes_utils.KIND_CONTEXT_NAME: - # If we are using KIND, the loadbalancer service will never be - # assigned an external IP.
Users may use ingress, but that requires - # blocking HTTP port 80. - # For now, we disable port opening feature on kind clusters. - unsupported_features[ - clouds.CloudImplementationFeatures.OPEN_PORTS] = ( - 'Opening ports is not supported in Kubernetes when ' - 'using local kind cluster.') + # if curr_context == kubernetes_utils.KIND_CONTEXT_NAME: + # # If we are using KIND, the loadbalancer service will never be + # # assigned an external IP. Users may use ingress, but that requires + # # blocking HTTP port 80. + # # For now, we disable port opening feature on kind clusters. + # unsupported_features[ + # clouds.CloudImplementationFeatures.OPEN_PORTS] = ( + # 'Opening ports is not supported in Kubernetes when ' + # 'using local kind cluster.') return unsupported_features @classmethod diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index b5440125e5b..efd731c482d 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -190,11 +190,17 @@ def get_ingress_external_ip_and_ports( ingress_service = ingress_services[0] if ingress_service.status.load_balancer.ingress is None: - # Try to use assigned external IP if it exists, - # otherwise return 'localhost' + # We try to get an IP/host for the service in the following order: + # 1. Try to use assigned external IP if it exists + # 2. Use the skypilot.co/external-ip annotation in the service + # 3. 
Otherwise return 'localhost' + ip = None if ingress_service.spec.external_i_ps is not None: ip = ingress_service.spec.external_i_ps[0] - else: + elif ingress_service.metadata.annotations is not None: + ip = ingress_service.metadata.annotations.get( + 'skypilot.co/external-ip', None) + if ip is None: ip = 'localhost' ports = ingress_service.spec.ports http_port = [port for port in ports if port.name == 'http'][0].node_port From 7436a7e4e57b25f8c7b726208f9d65beb34a2715 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 25 Mar 2024 15:02:11 -0700 Subject: [PATCH 30/85] Remove dshm_size_limit --- docs/source/reference/config.rst | 7 ------- sky/clouds/kubernetes.py | 2 -- sky/templates/kubernetes-ray.yml.j2 | 3 --- sky/utils/schemas.py | 3 --- 4 files changed, 15 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index c3cb5714845..96f515d5f18 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -315,13 +315,6 @@ Available fields and semantics: medium: Memory sizeLimit: 3Gi # Set a size limit for the /dev/shm volume - # Size of the /dev/shm shared memory for the pod (optional). - # - # Defaults to None, which means no size limits are set. If set, the value - # must be a string that is a valid Kubernetes quantity, e.g., "3Gi". - # https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/quantity/ - dshm_size_limit: 3Gi - # Advanced OCI configurations (optional). 
oci: # A dict mapping region names to region-specific configurations, or diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 8f58a17d159..cbf6db042e2 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -263,7 +263,6 @@ def make_deploy_resources_variables( port_mode = network_utils.get_port_mode(None) - dshm_size_limit = skypilot_config.get_nested(('kubernetes', 'dshm_size_limit'), None) remote_identity = skypilot_config.get_nested(('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) if remote_identity == 'LOCAL_CREDENTIALS': # SA name doesn't matter since automounting credentials will be turned off @@ -295,7 +294,6 @@ def make_deploy_resources_variables( 'k8s_acc_label_value': k8s_acc_label_value, 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME, 'k8s_ssh_jump_image': ssh_jump_image, - 'k8s_dshm_size_limit': dshm_size_limit, 'k8s_service_account_name': k8s_service_account_name, 'k8s_automount_service_account_token': k8s_automount_service_account_token, 'image_id': image_id, diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index faf489cdb26..1796c19c6fa 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -237,9 +237,6 @@ available_node_types: - name: dshm emptyDir: medium: Memory - {% if k8s_dshm_size_limit is not none %} - sizeLimit: {{k8s_dshm_size_limit}} - {% endif %} - name: dev-fuse # Required for fuse mounting hostPath: path: /dev/fuse diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index fd204b143f6..d54133e5cbb 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -615,9 +615,6 @@ def get_config_schema(): # Allow arbitrary keys since validating pod spec is hard 'additionalProperties': True, }, - 'dshm_size_limit': { - 'type': 'string' - }, 'remote_identity': { 'type': 'string' } From 3cc53528b834fbc87aeadf7aeffe64943440f84b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 25 Mar 2024 15:03:20 -0700 
Subject: [PATCH 31/85] Undo kind changes --- sky/clouds/kubernetes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index cbf6db042e2..047f79b4781 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -84,15 +84,15 @@ def _unsupported_features_for_resources( ) -> Dict[clouds.CloudImplementationFeatures, str]: unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES curr_context = kubernetes_utils.get_current_kube_config_context_name() - # if curr_context == kubernetes_utils.KIND_CONTEXT_NAME: - # # If we are using KIND, the loadbalancer service will never be - # # assigned an external IP. Users may use ingress, but that requires - # # blocking HTTP port 80. - # # For now, we disable port opening feature on kind clusters. - # unsupported_features[ - # clouds.CloudImplementationFeatures.OPEN_PORTS] = ( - # 'Opening ports is not supported in Kubernetes when ' - # 'using local kind cluster.') + if curr_context == kubernetes_utils.KIND_CONTEXT_NAME: + # If we are using KIND, the loadbalancer service will never be + # assigned an external IP. Users may use ingress, but that requires + # blocking HTTP port 80. + # For now, we disable port opening feature on kind clusters. 
+ unsupported_features[ + clouds.CloudImplementationFeatures.OPEN_PORTS] = ( + 'Opening ports is not supported in Kubernetes when ' + 'using local kind cluster.') return unsupported_features @classmethod From d836f17d00174af2c4f9f4bbcabd944cefcab8fc Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 25 Mar 2024 15:36:47 -0700 Subject: [PATCH 32/85] Update service account docs --- .../cloud-permissions/kubernetes.rst | 75 ++++++++++++++++--- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index e2ef3e325bc..87c1b747689 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -3,6 +3,59 @@ Kubernetes ========== +When running outside your Kubernetes cluster, SkyPilot uses your local ``~/.kube/config`` file +for authentication and creating resources on your Kubernetes cluster. + +When running inside your Kubernetes cluster (e.g., as a Spot controller or Serve controller), +SkyPilot can operate using either of the following three authentication methods: + +1. **Using your local kubeconfig file**: In this case, SkyPilot will + copy your local ``~/.kube/config`` file to the controller pod and use it for + authentication. This is the default method when running inside the cluster, + and no additional configuration is required. + + .. note:: + + If your cluster uses exec based authentication in your ``~/.kube/config`` file, + SkyPilot may not be able to authenticate using this method. In this case, + consider using the service account methods below. + +2. **Creating a service account**: SkyPilot can automatically create the service + account and roles for itself to manage resources in the Kubernetes cluster. + To use this method, set ``remote_identity: SERVICE_ACCOUNT`` to your + Kubernetes configuration in the ``~/.sky/config.yaml`` file: + + .. 
code-block:: yaml + + kubernetes: + remote_identity: SERVICE_ACCOUNT + + For details on the permissions that are granted to the service account, + refer to the `Permissions required by SkyPilot`_ section below. + +3. **Using a custom service account**: If you have a custom service account + with the `necessary permissions `__, you can configure + SkyPilot to use it by adding this to your ``~/.sky/config.yaml`` file: + + .. code-block:: yaml + + kubernetes: + remote_identity: your-service-account-name + +.. note:: + + Service account based authentication applies only when the SkyPilot + controller is running inside the Kubernetes cluster. When running outside + the cluster (e.g., on AWS), SkyPilot will use the local ``~/.kube/config`` + file for authentication. + +Below are the permissions required by SkyPilot and an example service account YAML that you can use to create a service account with the necessary permissions. + +.. _k8s-permissions: + +Permissions required by SkyPilot +-------------------------------- + SkyPilot requires permissions equivalent to the following roles to be able to manage the resources in the Kubernetes cluster: .. code-block:: yaml @@ -50,10 +103,13 @@ SkyPilot requires permissions equivalent to the following roles to be able to ma resources: ["services"] verbs: ["list", "get"] -Example Service Account YAML ----------------------------- -To create a service account bound with these roles, you can use the following YAML: +.. _k8s-sa-example: + +Example using Custom Service Account +------------------------------------ + +To create a service account that has the necessary permissions for SkyPilot, you can use the following YAML: .. code-block:: yaml @@ -163,20 +219,15 @@ To create a service account bound with these roles, you can use the following YA name: sky-sa-cluster-role apiGroup: rbac.authorization.k8s.io +Create the service account using the following command: + ..
code-block:: bash - kubectl apply -f create-sky-sa.yaml + $ kubectl apply -f create-sky-sa.yaml -After creating the service account, you can configure SkyPilot to use it through ``~/.sky/config.yaml``: +After creating the service account, configure SkyPilot to use it through ``~/.sky/config.yaml``: .. code-block:: yaml kubernetes: remote_identity: sky-sa # Or your service account name - -If you would like SkyPilot to automatically create the service account and roles, you can use the following config: - -.. code-block:: yaml - - kubernetes: - remote_identity: SERVICE_ACCOUNT # Will automatically create the service account and roles From 1b559d2f6583e625c7e1a7c59854819544399e87 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 25 Mar 2024 15:42:40 -0700 Subject: [PATCH 33/85] minor docs --- docs/source/cloud-setup/cloud-permissions/kubernetes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index 87c1b747689..dbb1504d30a 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -16,8 +16,8 @@ SkyPilot can operate using either of the following three authentication methods: .. note:: - If your cluster uses exec based authentication in your ``~/.kube/config`` file, - SkyPilot may not be able to authenticate using this method. In this case, + If your cluster uses exec based authentication in your ``~/.kube/config`` file + (e.g., GKE uses exec auth by default), SkyPilot may not be able to authenticate using this method. In this case, consider using the service account methods below. 2. 
**Creating a service account**: SkyPilot can automatically create the service From cda27b28b5ea9d2793de9837a44345b2fa53628a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 26 Mar 2024 10:48:05 -0700 Subject: [PATCH 34/85] update comment --- sky/backends/backend_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 825fd223b5c..593dc7d6fef 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2723,7 +2723,8 @@ def get_endpoints( with ux_utils.print_exception_no_traceback(): raise ValueError('Cluster does not have any ports ' 'to be exposed.') - # Else wait for the ports to be exposed + # Else ports have not been exposed even though they exist. + # In this case, ask the user to retry. else: error_msg = (f'No endpoints exposed yet. ' f'{_ENDPOINTS_RETRY_MESSAGE} ') From 23d730bfe1fce9b9ab50d0ffded4d40a9b41e2cc Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 11:06:09 -0700 Subject: [PATCH 35/85] is_same_cloud to cloud_in_list --- sky/utils/controller_utils.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 93db5ee4483..749a5089a80 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -139,18 +139,14 @@ def _get_cloud_dependencies_installation_commands( # TODO(tian): Make dependency installation command a method of cloud # class and get all installation command for enabled clouds. 
# AWS - if any( - cloud.is_same_cloud(clouds.AWS()) - for cloud in enabled_clouds): + if clouds.cloud_in_list(clouds.AWS(), enabled_clouds): commands.append( 'pip list | grep boto3 > /dev/null 2>&1 || ' 'pip install "urllib3<2" awscli>=1.27.10 botocore>=1.29.10 ' 'boto3>=1.26.1 > /dev/null 2>&1' ) # GCP - if any( - cloud.is_same_cloud(clouds.GCP()) - for cloud in enabled_clouds): + if clouds.cloud_in_list(clouds.GCP(), enabled_clouds): commands.extend( ['pip list | grep google-api-python-client > /dev/null 2>&1 || ' 'pip install google-api-python-client>=2.69.0 google-cloud-storage ' @@ -158,17 +154,13 @@ def _get_cloud_dependencies_installation_commands( f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}'] ) # Azure - if any( - cloud.is_same_cloud(clouds.Azure()) - for cloud in global_user_state.get_enabled_clouds()): + if clouds.cloud_in_list(clouds.Azure(), enabled_clouds): commands.append( 'pip list | grep azure-cli > /dev/null 2>&1 || ' 'pip install azure-cli>=2.31.0 azure-core azure-identity>=1.13.0 ' 'azure-mgmt-network > /dev/null 2>&1') # Kubernetes - if any( - cloud.is_same_cloud(clouds.Kubernetes()) - for cloud in global_user_state.get_enabled_clouds()): + if clouds.cloud_in_list(clouds.Kubernetes(), enabled_clouds): commands.append( # Install k8s + skypilot dependencies 'sudo bash -c "if ' @@ -184,9 +176,7 @@ def _get_cloud_dependencies_installation_commands( if controller_type == 'spot': # oci doesn't support open port yet, so we don't install oci # dependencies for sky serve controller. 
- if any( - cloud.is_same_cloud(clouds.OCI()) - for cloud in enabled_clouds): + if clouds.cloud_in_list(clouds.OCI(), enabled_clouds): commands.append( 'pip list | grep oci > /dev/null 2>&1 || ' 'pip install oci > /dev/null 2>&1' From cba92e37f2ed352f81242b099459c0503ed1dad6 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 13:29:49 -0700 Subject: [PATCH 36/85] refactor query_ports to use head_ip --- sky/backends/backend_utils.py | 18 +++++++++++------- sky/cli.py | 2 +- sky/provision/__init__.py | 5 +++++ sky/provision/aws/instance.py | 8 +++----- sky/provision/azure/__init__.py | 1 + sky/provision/azure/instance.py | 12 ++++++++++++ sky/provision/common.py | 14 ++++---------- sky/provision/gcp/instance.py | 8 +++----- sky/provision/kubernetes/network.py | 2 ++ sky/serve/core.py | 2 +- sky/serve/replica_managers.py | 2 +- sky/serve/serve_utils.py | 2 +- 12 files changed, 45 insertions(+), 31 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 593dc7d6fef..f61c6ede9e6 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2644,14 +2644,16 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str, def get_endpoints( cluster: str, endpoint: Optional[Union[int, - str]] = None) -> Union[str, Dict[int, str]]: + str]] = None) -> Dict[int, str]: """Gets the endpoint for a given cluster and port number (endpoint). Args: cluster: The name of the cluster. - endpoint: The port number to get the endpoint for. If None, all + endpoint: The port number to get the endpoint for. If None, ports for + all endpoints are returned. - Returns: Endpoint URL if endpoint is not None, else a dictionary of all + Returns: A dictionary of port numbers to endpoints. If endpoint is None, + the dictionary will contain all ports:endpoints exposed on the cluster. Raises: ValueError: if the cluster is not UP or the endpoint is not exposed. 
@@ -2694,10 +2696,12 @@ def get_endpoints( port_details = provision_lib.query_ports(repr(cloud), handle.cluster_name_on_cloud, handle.launched_resources.ports, - config['provider']) + head_ip=handle.head_ip, + provider_config=config['provider']) + # Validation before returning the endpoints if endpoint is not None: - # If cluster had no ports to be exposed + # If the requested endpoint was not to be exposed port_set = resources_utils.port_ranges_to_set( handle.launched_resources.ports) if endpoint not in port_set: @@ -2705,7 +2709,7 @@ def get_endpoints( raise ValueError(f'Port {endpoint} is not exposed ' 'on cluster ' f'{cluster_record["name"]!r}.') - # If the user requested a specific port endpoint + # If the user requested a specific port endpoint, check if it is exposed if endpoint not in port_details: error_msg = (f'Port {endpoint} not exposed yet. ' f'{_ENDPOINTS_RETRY_MESSAGE} ') @@ -2715,7 +2719,7 @@ def get_endpoints( error_msg += (kubernetes_utils.get_endpoint_debug_message()) with ux_utils.print_exception_no_traceback(): raise RuntimeError(error_msg) - return port_details[endpoint][0].url() + return {endpoint: port_details[endpoint][0].url()} else: if not port_details: # If cluster had no ports to be exposed diff --git a/sky/cli.py b/sky/cli.py index 69fc7d0d531..f4c5e87d1ea 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1962,7 +1962,7 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, if show_endpoints: if endpoint: cluster_endpoint = backend_utils.get_endpoints( - cluster_record['name'], endpoint) + cluster_record['name'], endpoint)[endpoint] click.echo(cluster_endpoint) else: cluster_endpoints = backend_utils.get_endpoints( diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py index 9dc73a54a53..58a05f04550 100644 --- a/sky/provision/__init__.py +++ b/sky/provision/__init__.py @@ -141,10 +141,15 @@ def query_ports( provider_name: str, cluster_name_on_cloud: str, ports: List[str], + head_ip: Optional[str] = None, 
provider_config: Optional[Dict[str, Any]] = None, ) -> Dict[int, List[common.Endpoint]]: """Query details about ports on a cluster. + If head_ip is provided, it may be used by the cloud implementation to + return the endpoint without querying the cloud provider. If head_ip is not + provided, the cloud provider will be queried to get the endpoint info. + Returns a dict with port as the key and a list of common.Endpoint. """ raise NotImplementedError diff --git a/sky/provision/aws/instance.py b/sky/provision/aws/instance.py index 387894f0044..16364dcc4d4 100644 --- a/sky/provision/aws/instance.py +++ b/sky/provision/aws/instance.py @@ -873,11 +873,9 @@ def get_cluster_info( def query_ports( cluster_name_on_cloud: str, ports: List[str], + head_ip: Optional[str] = None, provider_config: Optional[Dict[str, Any]] = None, ) -> Dict[int, List[common.Endpoint]]: """See sky/provision/__init__.py""" - assert provider_config is not None - cluster_info = get_cluster_info(provider_config['region'], - cluster_name_on_cloud, - provider_config=provider_config) - return common.query_ports_passthrough(ports, cluster_info) + del provider_config, cluster_name_on_cloud # unused + return common.query_ports_passthrough(ports, head_ip) diff --git a/sky/provision/azure/__init__.py b/sky/provision/azure/__init__.py index b83dbb462d9..9c87fc907db 100644 --- a/sky/provision/azure/__init__.py +++ b/sky/provision/azure/__init__.py @@ -2,3 +2,4 @@ from sky.provision.azure.instance import cleanup_ports from sky.provision.azure.instance import open_ports +from sky.provision.azure.instance import query_ports diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index e0b3937b486..e34cdffdba6 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -4,6 +4,7 @@ from sky import sky_logging from sky.adaptors import azure +from sky.provision import common from sky.utils import ux_utils logger = sky_logging.init_logger(__name__) @@ -93,3 +94,14 @@ def 
cleanup_ports( # Azure will automatically cleanup network security groups when cleanup # resource group. So we don't need to do anything here. del cluster_name_on_cloud, ports, provider_config # Unused. + + +def query_ports( + cluster_name_on_cloud: str, + ports: List[str], + head_ip: Optional[str] = None, + provider_config: Optional[Dict[str, Any]] = None, +) -> Dict[int, List[common.Endpoint]]: + """See sky/provision/__init__.py""" + del provider_config, cluster_name_on_cloud # unused + return common.query_ports_passthrough(ports, head_ip) diff --git a/sky/provision/common.py b/sky/provision/common.py index 32c5c3abbfb..e6dcdb85212 100644 --- a/sky/provision/common.py +++ b/sky/provision/common.py @@ -238,17 +238,11 @@ def url(self, override_ip: Optional[str] = None) -> str: def query_ports_passthrough( ports: List[str], - cluster_info: ClusterInfo, + head_ip: Optional[str], ) -> Dict[int, List[Endpoint]]: - """Common function to query ports for AWS, GCP and Azure. - - Returns a list of socket endpoint with empty host and the input ports.""" - assert cluster_info.head_instance_id is not None, cluster_info - head_instance = cluster_info.instances.get(cluster_info.head_instance_id) - if head_instance is None: - return {} - head_ip = head_instance[0].external_ip - assert head_ip is not None, head_instance + """Common function to get endpoints for AWS, GCP and Azure. 
+ + Returns a list of socket endpoint using head_ip and ports.""" ports = list(resources_utils.port_ranges_to_set(ports)) result: Dict[int, List[Endpoint]] = {} for port in ports: diff --git a/sky/provision/gcp/instance.py b/sky/provision/gcp/instance.py index 20ef0b6c78d..95d95209787 100644 --- a/sky/provision/gcp/instance.py +++ b/sky/provision/gcp/instance.py @@ -620,11 +620,9 @@ def cleanup_ports( def query_ports( cluster_name_on_cloud: str, ports: List[str], + head_ip: Optional[str] = None, provider_config: Optional[Dict[str, Any]] = None, ) -> Dict[int, List[common.Endpoint]]: """See sky/provision/__init__.py""" - assert provider_config is not None - cluster_info = get_cluster_info(provider_config['region'], - cluster_name_on_cloud, - provider_config=provider_config) - return common.query_ports_passthrough(ports, cluster_info) + del provider_config, cluster_name_on_cloud # unused + return common.query_ports_passthrough(ports, head_ip) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index 8bc628e6d8d..8e8c9bbc351 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -154,9 +154,11 @@ def _cleanup_ports_for_ingress( def query_ports( cluster_name_on_cloud: str, ports: List[str], + head_ip: Optional[str] = None, provider_config: Optional[Dict[str, Any]] = None, ) -> Dict[int, List[common.Endpoint]]: """See sky/provision/__init__.py""" + del head_ip # unused assert provider_config is not None, 'provider_config is required' port_mode = network_utils.get_port_mode( provider_config.get('port_mode', None)) diff --git a/sky/serve/core.py b/sky/serve/core.py index 7c33a24e4df..efa65967c86 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -267,7 +267,7 @@ def up( lb_port = serve_utils.load_service_initialization_result( lb_port_payload) endpoint = backend_utils.get_endpoints( - controller_handle.cluster_name, lb_port) + controller_handle.cluster_name, lb_port)[lb_port] 
sky_logging.print( f'{fore.CYAN}Service name: ' diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 401cc22852f..397ec49a98b 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -420,7 +420,7 @@ def url(self) -> Optional[str]: return None try: endpoint = backend_utils.get_endpoints(handle.cluster_name, - self.replica_port) + self.replica_port)[self.replica_port] assert isinstance(endpoint, str) return endpoint except RuntimeError: diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 61c095f8d16..e61da11757a 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -691,7 +691,7 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: return '-' try: endpoint = backend_utils.get_endpoints(handle.cluster_name, - load_balancer_port) + load_balancer_port)[load_balancer_port] assert isinstance(endpoint, str) except RuntimeError: return '-' From 789cefe8edaa6f796da8cd1cd2c9aed3e1f870c5 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 14:13:45 -0700 Subject: [PATCH 37/85] autodown + http prefixing in callers --- sky/serve/core.py | 12 +++--------- sky/serve/load_balancer.py | 4 ---- sky/serve/replica_managers.py | 8 +++++--- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/sky/serve/core.py b/sky/serve/core.py index efa65967c86..a671f96f833 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -192,20 +192,14 @@ def up( # whether the service is already running. If the id is the same # with the current job id, we know the service is up and running # for the first time; otherwise it is a name conflict. - # TODO(romilb): THIS NEEDS TO BE FIXED. If the user doesn't specify a cloud - # for the controller in their ~/.sky/config.yaml, this idle_minutes_to_autostop - # will be None and provisioning would fail on k8s with unsupported feature error. 
- idle_minutes_to_autostop = None - # if ( - # controller_cloud and - # controller_cloud.is_same_cloud(clouds.Kubernetes()) - # ) else constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP + idle_minutes_to_autodown = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP controller_job_id, controller_handle = sky.launch( task=controller_task, stream_logs=False, cluster_name=controller_name, detach_run=True, - idle_minutes_to_autostop=idle_minutes_to_autostop, + idle_minutes_to_autostop=idle_minutes_to_autodown, + down=True, retry_until_up=True, _disable_controller_check=True, ) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 13da18cc9cb..47f71ad4ec7 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -87,10 +87,6 @@ async def _redirect_handler(self, request: fastapi.Request): 'Use "sky serve status [SERVICE_NAME]" ' 'to check the replica status.') - # If replica doesn't start with http or https, add http:// - if not ready_replica_url.startswith('http'): - ready_replica_url = 'http://' + ready_replica_url - path = f'{ready_replica_url}{request.url.path}' logger.info(f'Redirecting request to {path}') return fastapi.responses.RedirectResponse(url=path) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 397ec49a98b..62e5cc32d78 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -419,9 +419,13 @@ def url(self) -> Optional[str]: if handle is None: return None try: + replica_port_int = int(self.replica_port) endpoint = backend_utils.get_endpoints(handle.cluster_name, - self.replica_port)[self.replica_port] + replica_port_int)[replica_port_int] assert isinstance(endpoint, str) + # If replica doesn't start with http or https, add http:// + if not endpoint.startswith('http'): + endpoint = 'http://' + endpoint return endpoint except RuntimeError: return None @@ -474,8 +478,6 @@ def probe( logger.info(f'Error when probing {replica_identity}: ' 'Cannot get the endpoint.') return 
self, False, probe_time - elif not url.startswith('http://'): - url = f'http://{url}' readiness_path = (f'{url}{readiness_path}') logger.info(f'Probing {replica_identity} with {readiness_path}.') if post_data is not None: From 55b63debb142d62089afcb9f2784ce91dc76e118 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 15:20:59 -0700 Subject: [PATCH 38/85] fix ssh key issues when user hash is reused --- sky/authentication.py | 2 +- sky/clouds/kubernetes.py | 10 +++++++-- sky/templates/sky-serve-controller.yaml.j2 | 3 --- sky/utils/common_utils.py | 26 +++++++++++++++------- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/sky/authentication.py b/sky/authentication.py index f98037b8885..099c0f62284 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -402,7 +402,7 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]: # Add the user's public key to the SkyPilot cluster. public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH) secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME - secret_field_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_FIELD_NAME + secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name namespace = kubernetes_utils.get_current_kube_config_context_namespace() k8s = kubernetes.get_kubernetes() with open(public_key_path, 'r', encoding='utf-8') as f: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 047f79b4781..1c33a9fd648 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -32,8 +32,6 @@ class Kubernetes(clouds.Cloud): """Kubernetes.""" SKY_SSH_KEY_SECRET_NAME = 'sky-ssh-keys' - SKY_SSH_KEY_SECRET_FIELD_NAME = \ - f'ssh-publickey-{common_utils.get_user_hash()}' SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod' SKY_DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account' PORT_FORWARD_PROXY_CMD_TEMPLATE = \ @@ -78,6 +76,14 @@ class Kubernetes(clouds.Cloud): PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT STATUS_VERSION = 
clouds.StatusVersion.SKYPILOT + @property + def ssh_key_secret_field_name(self): + # Use a fresh user hash to avoid conflicts in the secret object naming. + # This can happen when the controller is reusing the same user hash + # through USER_ID_ENV_VAR but has a different SSH key. + fresh_user_hash = common_utils.get_user_hash(force_fresh_hash=True) + return f'ssh-publickey-{fresh_user_hash}' + @classmethod def _unsupported_features_for_resources( cls, resources: 'resources_lib.Resources' diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index 17e143da9b4..f9ec53e3697 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -18,9 +18,6 @@ file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} {{remote_user_config_path}}: skypilot:local_skypilot_config_path ~/.sky/catalogs: ~/.sky/catalogs - # TODO(romilb): Fix before merging. - ~/.ssh/sky-key: ~/.ssh/sky-key - ~/.ssh/sky-key.pub: ~/.ssh/sky-key.pub run: | # Start sky serve service. diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index 28fcfaf0c59..14b1b6ccb3f 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -61,11 +61,17 @@ def get_usage_run_id() -> str: return _usage_run_id -def get_user_hash() -> str: +def get_user_hash(force_fresh_hash: bool = False) -> str: """Returns a unique user-machine specific hash as a user id. We cache the user hash in a file to avoid potential user_name or hostname changes causing a new user hash to be generated. + + Args: + force_fresh_hash: Bypasses the cached hash in USER_HASH_FILE and the + hash in the USER_ID_ENV_VAR and forces a fresh user-machine hash + to be generated. 
+ """ def _is_valid_user_hash(user_hash: Optional[str]) -> bool: @@ -77,12 +83,13 @@ def _is_valid_user_hash(user_hash: Optional[str]) -> bool: return False return len(user_hash) == USER_HASH_LENGTH - user_hash = os.getenv(constants.USER_ID_ENV_VAR) - if _is_valid_user_hash(user_hash): - assert user_hash is not None - return user_hash + if not force_fresh_hash: + user_hash = os.getenv(constants.USER_ID_ENV_VAR) + if _is_valid_user_hash(user_hash): + assert user_hash is not None + return user_hash - if os.path.exists(_USER_HASH_FILE): + if not force_fresh_hash and os.path.exists(_USER_HASH_FILE): # Read from cached user hash file. with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f: # Remove invalid characters. @@ -96,8 +103,11 @@ def _is_valid_user_hash(user_hash: Optional[str]) -> bool: # A fallback in case the hash is invalid. user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH] os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True) - with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f: - f.write(user_hash) + if not force_fresh_hash: + # Do not cache to file if force_fresh_hash is True since the file may + # be intentionally using a different hash. 
+ with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f: + f.write(user_hash) return user_hash From 73ad2e60383ba08762fb420a7e214f3e79fb5689 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 15:37:27 -0700 Subject: [PATCH 39/85] linting --- sky/adaptors/kubernetes.py | 7 +++-- sky/backends/backend_utils.py | 12 ++++----- sky/cli.py | 3 ++- sky/clouds/kubernetes.py | 11 +++----- sky/provision/common.py | 1 + sky/provision/kubernetes/config.py | 43 +++++++++++++++++++++--------- sky/provision/kubernetes/utils.py | 32 ++++++++++++++-------- sky/serve/core.py | 1 - sky/serve/load_balancer.py | 10 ++++--- sky/serve/replica_managers.py | 4 +-- sky/serve/serve_utils.py | 4 +-- sky/utils/controller_utils.py | 25 +++++++++-------- sky/utils/schemas.py | 1 + 13 files changed, 90 insertions(+), 64 deletions(-) diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py index 26f9ac1d121..6b52afd3752 100644 --- a/sky/adaptors/kubernetes.py +++ b/sky/adaptors/kubernetes.py @@ -79,10 +79,9 @@ def _load_config(): ' If you were running a local Kubernetes ' 'cluster, run `sky local up` to start the cluster.') else: - err_str = ( - 'Failed to load Kubernetes configuration. ' - 'Please check if your kubeconfig file exists at ' - f'~/.kube/config and is valid.{suffix}') + err_str = ('Failed to load Kubernetes configuration. ' + 'Please check if your kubeconfig file exists at ' + f'~/.kube/config and is valid.{suffix}') err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.' 
with ux_utils.print_exception_no_traceback(): raise ValueError(err_str) from None diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index f61c6ede9e6..d98b5d73ada 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -48,10 +48,10 @@ from sky.utils import env_options from sky.utils import resources_utils from sky.utils import rich_utils +from sky.utils import schemas from sky.utils import subprocess_utils from sky.utils import timeline from sky.utils import ux_utils -from sky.utils import schemas if typing.TYPE_CHECKING: from sky import resources @@ -801,7 +801,8 @@ def write_cluster_config( assert cluster_name is not None excluded_clouds = [] remote_identity = skypilot_config.get_nested( - (str(cloud).lower(), 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + (str(cloud).lower(), 'remote_identity'), + schemas.REMOTE_IDENTITY_DEFAULT) # For Kubernetes, remote_identity can be 'SERVICE_ACCOUNT', # 'LOCAL_CREDENTIALS' or a string for the service account to use. if remote_identity != 'LOCAL_CREDENTIALS': @@ -2641,10 +2642,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str, f'\n--- Details ---\n{stderr.strip()}\n') -def get_endpoints( - cluster: str, - endpoint: Optional[Union[int, - str]] = None) -> Dict[int, str]: +def get_endpoints(cluster: str, + endpoint: Optional[Union[int, str]] = None) -> Dict[int, str]: """Gets the endpoint for a given cluster and port number (endpoint). Args: @@ -2670,7 +2669,6 @@ def get_endpoints( cluster_records = get_clusters(include_controller=True, refresh=False, cluster_names=[cluster]) - #TODO(romilb): Add error message for > 1 cluster records here before merging. 
cluster_record = cluster_records[0] if cluster_record['status'] != status_lib.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): diff --git a/sky/cli.py b/sky/cli.py index f4c5e87d1ea..a2fb1a4c9fa 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3014,7 +3014,8 @@ def _down_or_stop_clusters( hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] try: hint_or_raise(controller_name) - except (exceptions.ClusterOwnerIdentityMismatchError, RuntimeError) as e: + except (exceptions.ClusterOwnerIdentityMismatchError, + RuntimeError) as e: if purge: click.echo(common_utils.format_exception(e)) else: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 1c33a9fd648..61d7a461602 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -15,7 +15,6 @@ from sky.utils import resources_utils from sky.utils import schemas - if typing.TYPE_CHECKING: # Renaming to avoid shadowing variables. from sky import resources as resources_lib @@ -269,9 +268,10 @@ def make_deploy_resources_variables( port_mode = network_utils.get_port_mode(None) - remote_identity = skypilot_config.get_nested(('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + remote_identity = skypilot_config.get_nested( + ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) if remote_identity == 'LOCAL_CREDENTIALS': - # SA name doesn't matter since automounting credentials will be turned off + # SA name doesn't matter since automounting credentials is disabled k8s_service_account_name = 'default' k8s_automount_service_account_token = 'false' elif remote_identity == 'SERVICE_ACCOUNT': @@ -283,7 +283,6 @@ def make_deploy_resources_variables( k8s_service_account_name = remote_identity k8s_automount_service_account_token = 'true' - deploy_vars = { 'instance_type': resources.instance_type, 'custom_resources': custom_resources, @@ -382,9 +381,7 @@ def get_credential_file_mounts(self) -> Dict[str, str]: if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)): 
# Upload kubeconfig to the default path to avoid having to set # KUBECONFIG in the environment. - return { - DEFAULT_KUBECONFIG_PATH: CREDENTIAL_PATH - } + return {DEFAULT_KUBECONFIG_PATH: CREDENTIAL_PATH} else: return {} diff --git a/sky/provision/common.py b/sky/provision/common.py index e6dcdb85212..dbcb9e659e6 100644 --- a/sky/provision/common.py +++ b/sky/provision/common.py @@ -243,6 +243,7 @@ def query_ports_passthrough( """Common function to get endpoints for AWS, GCP and Azure. Returns a list of socket endpoint using head_ip and ports.""" + assert head_ip is not None, head_ip ports = list(resources_utils.port_ranges_to_set(ports)) result: Dict[int, List[Endpoint]] = {} for port in ports: diff --git a/sky/provision/kubernetes/config.py b/sky/provision/kubernetes/config.py index 605e164142c..43786f5bdd4 100644 --- a/sky/provision/kubernetes/config.py +++ b/sky/provision/kubernetes/config.py @@ -32,23 +32,36 @@ def bootstrap_instances( # If not, set up the roles and bindings for skypilot-service-account # here. 
_configure_autoscaler_service_account(namespace, config.provider_config) - _configure_autoscaler_role(namespace, config.provider_config, role_field='autoscaler_role') - _configure_autoscaler_role_binding(namespace, config.provider_config, binding_field='autoscaler_role_binding') + _configure_autoscaler_role(namespace, + config.provider_config, + role_field='autoscaler_role') + _configure_autoscaler_role_binding( + namespace, + config.provider_config, + binding_field='autoscaler_role_binding') _configure_autoscaler_cluster_role(namespace, config.provider_config) - _configure_autoscaler_cluster_role_binding(namespace, config.provider_config) + _configure_autoscaler_cluster_role_binding(namespace, + config.provider_config) if config.provider_config.get('port_mode', 'loadbalancer') == 'ingress': - logger.info('Port mode is set to ingress, setting up ingress role and role binding') + logger.info('Port mode is set to ingress, setting up ingress role ' + 'and role binding.') try: - _configure_autoscaler_role(namespace, config.provider_config, role_field='autoscaler_ingress_role') - _configure_autoscaler_role_binding(namespace, config.provider_config, binding_field='autoscaler_ingress_role_binding') + _configure_autoscaler_role(namespace, + config.provider_config, + role_field='autoscaler_ingress_role') + _configure_autoscaler_role_binding( + namespace, + config.provider_config, + binding_field='autoscaler_ingress_role_binding') except kubernetes.api_exception() as e: # If namespace is not found, we will ignore the error if e.status == 404: - logger.info(f'Namespace not found - is your nginx ingress installed? Skipping ingress role and role binding setup.') + logger.info( + 'Namespace not found - is your nginx ingress installed?' 
+ ' Skipping ingress role and role binding setup.') else: raise e - elif requested_service_account != 'default': logger.info(f'Using service account {requested_service_account!r}, ' 'skipping role and role binding setup.') @@ -230,8 +243,7 @@ def _configure_autoscaler_service_account( f'{created_msg(account_field, name)}') -def _configure_autoscaler_role(namespace: str, - provider_config: Dict[str, Any], +def _configure_autoscaler_role(namespace: str, provider_config: Dict[str, Any], role_field: str) -> None: """ Reads the role from the provider config, creates if it does not exist. @@ -315,7 +327,9 @@ def _configure_autoscaler_role_binding(namespace: str, logger.info('_configure_autoscaler_role_binding: ' f'{created_msg(binding_field, name)}') -def _configure_autoscaler_cluster_role(namespace, provider_config: Dict[str, Any]) -> None: + +def _configure_autoscaler_cluster_role(namespace, + provider_config: Dict[str, Any]) -> None: role_field = 'autoscaler_cluster_role' if role_field not in provider_config: logger.info('_configure_autoscaler_cluster_role: ' @@ -341,10 +355,12 @@ def _configure_autoscaler_cluster_role(namespace, provider_config: Dict[str, Any logger.info('_configure_autoscaler_cluster_role: ' f'{not_found_msg(role_field, name)}') kubernetes.auth_api().create_cluster_role(role) - logger.info(f'_configure_autoscaler_cluster_role: {created_msg(role_field, name)}') + logger.info( + f'_configure_autoscaler_cluster_role: {created_msg(role_field, name)}') -def _configure_autoscaler_cluster_role_binding(namespace, provider_config: Dict[str, Any]) -> None: +def _configure_autoscaler_cluster_role_binding( + namespace, provider_config: Dict[str, Any]) -> None: binding_field = 'autoscaler_cluster_role_binding' if binding_field not in provider_config: logger.info('_configure_autoscaler_cluster_role_binding: ' @@ -380,6 +396,7 @@ def _configure_autoscaler_cluster_role_binding(namespace, provider_config: Dict[ 
logger.info('_configure_autoscaler_cluster_role_binding: ' f'{created_msg(binding_field, name)}') + def _configure_ssh_jump(namespace, config: common.ProvisionConfig): """Creates a SSH jump pod to connect to the cluster. diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 1e2b3ddcea6..35cb779a1cb 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -521,7 +521,7 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ try: k8s.config.load_kube_config() except kubernetes.config_exception(): - pass # Using service account token or other auth methods, continue + pass # Using service account token or other auth methods, continue else: # Get active context and user from kubeconfig using k8s api _, current_context = k8s.config.list_kube_config_contexts() @@ -530,26 +530,35 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ # K8s api does not provide a mechanism to get the user details from the # context. We need to load the kubeconfig file and parse it to get the # user details. 
- kubeconfig_path = os.path.expanduser(os.getenv('KUBECONFIG', - k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION)) + kubeconfig_path = os.path.expanduser( + os.getenv('KUBECONFIG', + k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION)) # Load the kubeconfig file as a dictionary - with open(kubeconfig_path, 'r') as f: + with open(kubeconfig_path, 'r', encoding='utf-8') as f: kubeconfig = yaml.safe_load(f) user_details = kubeconfig['users'] # Find user matching the target username - user_details = next(user for user in user_details if user['name'] == target_username) + user_details = next( + user for user in user_details if user['name'] == target_username) - remote_identity = skypilot_config.get_nested(('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + remote_identity = skypilot_config.get_nested( + ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) if ('exec' in user_details.get('user', {}) and remote_identity == 'LOCAL_CREDENTIALS'): ctx_name = current_context['name'] - exec_msg = ('exec-based authentication is used for ' - f'Kubernetes context {ctx_name!r}.' - ' This may cause issues when running Managed Spot ' - 'or SkyServe controller on Kubernetes. To fix, configure SkyPilot to create a service account for running pods by adding ' - 'the following in ~/.sky/config.yaml:\n kubernetes:\n remote_identity: SERVICE_ACCOUNT\n More: https://skypilot.readthedocs.io/en/latest/reference/config.html') + exec_msg = ( + 'exec-based authentication is used for ' + f'Kubernetes context {ctx_name!r}.' + ' This may cause issues when running Managed Spot ' + 'or SkyServe controller on Kubernetes. 
To fix, configure ' + 'SkyPilot to create a service account for running pods by ' + 'adding the following in ~/.sky/config.yaml:\n' + ' kubernetes:\n' + ' remote_identity: SERVICE_ACCOUNT\n' + ' More: https://skypilot.readthedocs.io/en/latest/' + 'reference/config.html') # We now check if GPUs are available and labels are set correctly on the # cluster, and if not we return hints that may help debug any issues. @@ -571,6 +580,7 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ else: return True, None + def get_current_kube_config_context_name() -> Optional[str]: """Get the current kubernetes context from the kubeconfig file diff --git a/sky/serve/core.py b/sky/serve/core.py index a671f96f833..ef8828a7933 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -7,7 +7,6 @@ import sky from sky import backends -from sky import clouds from sky import exceptions from sky import global_user_state from sky import sky_logging diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 47f71ad4ec7..7864e242148 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -115,14 +115,18 @@ def run_load_balancer(controller_addr: str, load_balancer_port: int): load_balancer_port=load_balancer_port) load_balancer.run() + if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() - parser.add_argument('--controller-addr', required=True, + parser.add_argument('--controller-addr', + required=True, default='127.0.0.1', help='The address of the controller.') - parser.add_argument('--load-balancer-port', type=int, required=True, + parser.add_argument('--load-balancer-port', + type=int, + required=True, default=8890, help='The port where the load balancer listens to.') args = parser.parse_args() - run_load_balancer(args.controller_addr, args.load_balancer_port) \ No newline at end of file + run_load_balancer(args.controller_addr, args.load_balancer_port) diff --git a/sky/serve/replica_managers.py 
b/sky/serve/replica_managers.py index 62e5cc32d78..2cd5565cbb4 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -420,8 +420,8 @@ def url(self) -> Optional[str]: return None try: replica_port_int = int(self.replica_port) - endpoint = backend_utils.get_endpoints(handle.cluster_name, - replica_port_int)[replica_port_int] + endpoint = backend_utils.get_endpoints( + handle.cluster_name, replica_port_int)[replica_port_int] assert isinstance(endpoint, str) # If replica doesn't start with http or https, add http:// if not endpoint.startswith('http'): diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index e61da11757a..6dd73335bec 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -690,8 +690,8 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: if load_balancer_port is None: return '-' try: - endpoint = backend_utils.get_endpoints(handle.cluster_name, - load_balancer_port)[load_balancer_port] + endpoint = backend_utils.get_endpoints( + handle.cluster_name, load_balancer_port)[load_balancer_port] assert isinstance(endpoint, str) except RuntimeError: return '-' diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 749a5089a80..f9e49dad3d6 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -143,16 +143,14 @@ def _get_cloud_dependencies_installation_commands( commands.append( 'pip list | grep boto3 > /dev/null 2>&1 || ' 'pip install "urllib3<2" awscli>=1.27.10 botocore>=1.29.10 ' - 'boto3>=1.26.1 > /dev/null 2>&1' - ) + 'boto3>=1.26.1 > /dev/null 2>&1') # GCP if clouds.cloud_in_list(clouds.GCP(), enabled_clouds): - commands.extend( - ['pip list | grep google-api-python-client > /dev/null 2>&1 || ' + commands.extend([ + 'pip list | grep google-api-python-client > /dev/null 2>&1 || ' 'pip install google-api-python-client>=2.69.0 google-cloud-storage ' - '> /dev/null 2>&1', - f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}'] - ) + '> /dev/null 2>&1', 
f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}' + ]) # Azure if clouds.cloud_in_list(clouds.Azure(), enabled_clouds): commands.append( @@ -170,17 +168,18 @@ def _get_cloud_dependencies_installation_commands( 'then apt update && apt install curl socat netcat -y; ' 'fi" && ' # Install kubectl - '(command -v kubectl &>/dev/null || (curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl)) && ' - ) + '(command -v kubectl &>/dev/null || ' + '(curl -LO "https://dl.k8s.io/release/$(curl -L -s ' + 'https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' + 'sudo install -o root -g root -m 0755 ' + 'kubectl /usr/local/bin/kubectl)) && ') # OCI if controller_type == 'spot': # oci doesn't support open port yet, so we don't install oci # dependencies for sky serve controller. if clouds.cloud_in_list(clouds.OCI(), enabled_clouds): - commands.append( - 'pip list | grep oci > /dev/null 2>&1 || ' - 'pip install oci > /dev/null 2>&1' - ) + commands.append('pip list | grep oci > /dev/null 2>&1 || ' + 'pip install oci > /dev/null 2>&1') # ibm doesn't support open port and spot instance yet, so we don't # install them for either controller. 
return commands diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index d54133e5cbb..f91636bd037 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -536,6 +536,7 @@ def get_cluster_schema(): REMOTE_IDENTITY_DEFAULT = 'LOCAL_CREDENTIALS' + def get_config_schema(): # pylint: disable=import-outside-toplevel from sky.utils import kubernetes_enums From 31418d259c40ca447ae6cd4975a72230726015c8 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 15:38:25 -0700 Subject: [PATCH 40/85] lint --- sky/clouds/kubernetes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 61d7a461602..f7b31ce74df 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -300,7 +300,8 @@ def make_deploy_resources_variables( 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME, 'k8s_ssh_jump_image': ssh_jump_image, 'k8s_service_account_name': k8s_service_account_name, - 'k8s_automount_service_account_token': k8s_automount_service_account_token, + 'k8s_automount_service_account_token': + k8s_automount_service_account_token, 'image_id': image_id, } From 0684f2b4e5f44fb7ff1b77aa411434e338fa5b00 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 16:49:21 -0700 Subject: [PATCH 41/85] lint, HOST_CONTROLLERS --- sky/clouds/cloud.py | 1 + sky/clouds/kubernetes.py | 14 ++-- sky/execution.py | 5 ++ sky/provision/kubernetes/utils.py | 106 ++++++++++++++++------------ sky/templates/kubernetes-ray.yml.j2 | 2 +- 5 files changed, 78 insertions(+), 50 deletions(-) diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index ed1ff0235bb..ea01ffda7ff 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -41,6 +41,7 @@ class CloudImplementationFeatures(enum.Enum): SPOT_INSTANCE = 'spot_instance' CUSTOM_DISK_TIER = 'custom_disk_tier' OPEN_PORTS = 'open_ports' + HOST_CONTROLLERS = 'host_controllers' class Region(collections.namedtuple('Region', ['name'])): diff --git 
a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index f7b31ce74df..410efb29574 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -98,6 +98,11 @@ def _unsupported_features_for_resources( clouds.CloudImplementationFeatures.OPEN_PORTS] = ( 'Opening ports is not supported in Kubernetes when ' 'using local kind cluster.') + + is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth() + if is_exec_auth: + unsupported_features[ + clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message return unsupported_features @classmethod @@ -273,15 +278,15 @@ def make_deploy_resources_variables( if remote_identity == 'LOCAL_CREDENTIALS': # SA name doesn't matter since automounting credentials is disabled k8s_service_account_name = 'default' - k8s_automount_service_account_token = 'false' + k8s_automount_sa_token = 'false' elif remote_identity == 'SERVICE_ACCOUNT': # Use the default service account k8s_service_account_name = self.SKY_DEFAULT_SERVICE_ACCOUNT_NAME - k8s_automount_service_account_token = 'true' + k8s_automount_sa_token = 'true' else: # User specified a custom service account k8s_service_account_name = remote_identity - k8s_automount_service_account_token = 'true' + k8s_automount_sa_token = 'true' deploy_vars = { 'instance_type': resources.instance_type, @@ -300,8 +305,7 @@ def make_deploy_resources_variables( 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME, 'k8s_ssh_jump_image': ssh_jump_image, 'k8s_service_account_name': k8s_service_account_name, - 'k8s_automount_service_account_token': - k8s_automount_service_account_token, + 'k8s_automount_sa_token': k8s_automount_sa_token, 'image_id': image_id, } diff --git a/sky/execution.py b/sky/execution.py index 4e056075d49..60d09254abf 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -233,6 +233,11 @@ def _execute( # Requested features that some clouds support and others don't. requested_features = set() + # If task is controller, request HOST_CONTROLLERS feature. 
+ if controller_utils.Controllers.from_name(cluster_name) is not None: + requested_features.add( + clouds.CloudImplementationFeatures.HOST_CONTROLLERS) + if task.num_nodes > 1: requested_features.add(clouds.CloudImplementationFeatures.MULTI_NODE) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 35cb779a1cb..fa7c245965e 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -515,50 +515,7 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ # We now do softer checks to check if exec based auth is used and to # see if the cluster is GPU-enabled. - # Check if exec based auth is used - exec_msg = '' - k8s = kubernetes.get_kubernetes() - try: - k8s.config.load_kube_config() - except kubernetes.config_exception(): - pass # Using service account token or other auth methods, continue - else: - # Get active context and user from kubeconfig using k8s api - _, current_context = k8s.config.list_kube_config_contexts() - target_username = current_context['context']['user'] - - # K8s api does not provide a mechanism to get the user details from the - # context. We need to load the kubeconfig file and parse it to get the - # user details. 
- kubeconfig_path = os.path.expanduser( - os.getenv('KUBECONFIG', - k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION)) - # Load the kubeconfig file as a dictionary - with open(kubeconfig_path, 'r', encoding='utf-8') as f: - kubeconfig = yaml.safe_load(f) - - user_details = kubeconfig['users'] - - # Find user matching the target username - user_details = next( - user for user in user_details if user['name'] == target_username) - - remote_identity = skypilot_config.get_nested( - ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) - if ('exec' in user_details.get('user', {}) and - remote_identity == 'LOCAL_CREDENTIALS'): - ctx_name = current_context['name'] - exec_msg = ( - 'exec-based authentication is used for ' - f'Kubernetes context {ctx_name!r}.' - ' This may cause issues when running Managed Spot ' - 'or SkyServe controller on Kubernetes. To fix, configure ' - 'SkyPilot to create a service account for running pods by ' - 'adding the following in ~/.sky/config.yaml:\n' - ' kubernetes:\n' - ' remote_identity: SERVICE_ACCOUNT\n' - ' More: https://skypilot.readthedocs.io/en/latest/' - 'reference/config.html') + is_exec_auth, exec_msg = is_kubeconfig_exec_auth() # We now check if GPUs are available and labels are set correctly on the # cluster, and if not we return hints that may help debug any issues. @@ -581,6 +538,67 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ return True, None +def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: + """Checks if the kubeconfig file uses exec-based authentication + + Using exec-based authentication is problematic when used in conjunction + with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/config.yaml. + This is because the exec-based authentication may not have the relevant + dependencies installed on the remote cluster or may have hardcoded paths + that are not available on the remote cluster. 
+ + Returns: + bool: True if exec-based authentication is used and LOCAL_CREDENTIAL + mode is used for remote_identity in ~/.sky/config.yaml. + str: Error message if exec-based authentication is used, None otherwise + """ + k8s = kubernetes.get_kubernetes() + try: + k8s.config.load_kube_config() + except kubernetes.config_exception(): + # Using service account token or other auth methods, continue + return False, None + + # Get active context and user from kubeconfig using k8s api + _, current_context = k8s.config.list_kube_config_contexts() + target_username = current_context['context']['user'] + + # K8s api does not provide a mechanism to get the user details from the + # context. We need to load the kubeconfig file and parse it to get the + # user details. + kubeconfig_path = os.path.expanduser( + os.getenv('KUBECONFIG', + k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION)) + # Load the kubeconfig file as a dictionary + with open(kubeconfig_path, 'r', encoding='utf-8') as f: + kubeconfig = yaml.safe_load(f) + + user_details = kubeconfig['users'] + + # Find user matching the target username + user_details = next( + user for user in user_details if user['name'] == target_username) + + remote_identity = skypilot_config.get_nested( + ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + if ('exec' in user_details.get('user', {}) and + remote_identity == 'LOCAL_CREDENTIALS'): + ctx_name = current_context['name'] + exec_msg = ( + 'exec-based authentication is used for ' + f'Kubernetes context {ctx_name!r}.' + ' This may cause issues when running Managed Spot ' + 'or SkyServe controller on Kubernetes. 
To fix, configure ' + 'SkyPilot to create a service account for running pods by ' + 'adding the following in ~/.sky/config.yaml:\n' + ' kubernetes:\n' + ' remote_identity: SERVICE_ACCOUNT\n' + ' More: https://skypilot.readthedocs.io/en/latest/' + 'reference/config.html') + return True, exec_msg + return False, None + + def get_current_kube_config_context_name() -> Optional[str]: """Get the current kubernetes context from the kubeconfig file diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 1796c19c6fa..d0b8d72824b 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -217,7 +217,7 @@ available_node_types: spec: # serviceAccountName: skypilot-service-account serviceAccountName: {{k8s_service_account_name}} - automountServiceAccountToken: {{k8s_automount_service_account_token}} + automountServiceAccountToken: {{k8s_automount_sa_token}} restartPolicy: Never From e7f232ee4c56d90d041f9bd48a2e248d805207c2 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 17:56:36 -0700 Subject: [PATCH 42/85] add serve smoke tests for k8s --- tests/skyserve/http/kubernetes.yaml | 15 ++++++++++ tests/test_smoke.py | 43 +++++++++++++++++++---------- 2 files changed, 44 insertions(+), 14 deletions(-) create mode 100644 tests/skyserve/http/kubernetes.yaml diff --git a/tests/skyserve/http/kubernetes.yaml b/tests/skyserve/http/kubernetes.yaml new file mode 100644 index 00000000000..441d097b12b --- /dev/null +++ b/tests/skyserve/http/kubernetes.yaml @@ -0,0 +1,15 @@ +service: + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 2 + +resources: + ports: 8080 + cloud: kubernetes + cpus: 2+ + +workdir: examples/serve/http_server + +# Use 8080 to test jupyter service is terminated +run: python3 server.py --port 8080 diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f8dca3baa2c..5ede713a19c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2968,9 +2968,19 
@@ def test_skyserve_azure_http(): run_one_test(test) +@pytest.mark.kubernetes +@pytest.mark.sky_serve +def test_skyserve_kubernetes_http(): + """Test skyserve on Kubernetes""" + name = _get_service_name() + test = _get_skyserve_http_test(name, 'kubernetes', 30) + run_one_test(test) + + @pytest.mark.gcp +@pytest.mark.kubernetes @pytest.mark.sky_serve -def test_skyserve_llm(): +def test_skyserve_llm(generic_cloud): """Test skyserve with real LLM usecase""" name = _get_service_name() @@ -2988,7 +2998,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: test = Test( f'test-skyserve-llm', [ - f'sky serve up -n {name} -y tests/skyserve/llm/service.yaml', + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/llm/service.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), *[ generate_llm_test_command(prompt, output) @@ -3131,14 +3141,15 @@ def test_skyserve_spot_user_bug(): @pytest.mark.gcp +@pytest.mark.kubernetes @pytest.mark.sky_serve -def test_skyserve_load_balancer(): +def test_skyserve_load_balancer(generic_cloud): """Test skyserve load balancer round-robin policy""" name = _get_service_name() test = Test( f'test-skyserve-load-balancer', [ - f'sky serve up -n {name} -y tests/skyserve/load_balancer/service.yaml', + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/load_balancer/service.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 1)}; ' f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' @@ -3192,15 +3203,16 @@ def test_skyserve_auto_restart(): @pytest.mark.gcp +@pytest.mark.kubernetes @pytest.mark.sky_serve -def test_skyserve_cancel(): +def test_skyserve_cancel(generic_cloud): """Test skyserve with cancel""" name = _get_service_name() test = Test( f'test-skyserve-cancel', [ - f'sky serve up -n {name} -y tests/skyserve/cancel/cancel.yaml', + f'sky serve up -n {name} --cloud {generic_cloud} -y 
tests/skyserve/cancel/cancel.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_get_serve_endpoint(name)}; python3 ' 'tests/skyserve/cancel/send_cancel_request.py ' @@ -3214,14 +3226,15 @@ def test_skyserve_cancel(): @pytest.mark.gcp +@pytest.mark.kubernetes @pytest.mark.sky_serve -def test_skyserve_update(): +def test_skyserve_update(generic_cloud): """Test skyserve with update""" name = _get_service_name() test = Test( f'test-skyserve-update', [ - f'sky serve up -n {name} -y tests/skyserve/update/old.yaml', + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/old.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', f'sky serve update {name} -y tests/skyserve/update/new.yaml', @@ -3237,8 +3250,9 @@ def test_skyserve_update(): @pytest.mark.gcp +@pytest.mark.kubernetes @pytest.mark.sky_serve -def test_skyserve_fast_update(): +def test_skyserve_fast_update(generic_cloud): """Test skyserve with fast update (Increment version of old replicas)""" name = _get_service_name() @@ -3255,10 +3269,10 @@ def _check_one_provisioning_in_status(name: str) -> str: test = Test( f'test-skyserve-fast-update', [ - f'sky serve up -n {name} -y tests/skyserve/update/bump_version_before.yaml', + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} -y tests/skyserve/update/bump_version_after.yaml', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_after.yaml', # sleep to wait for update to be registered. 'sleep 30', # READY for service + two READY replicas. 
@@ -3275,17 +3289,18 @@ def _check_one_provisioning_in_status(name: str) -> str: @pytest.mark.gcp +@pytest.mark.kubernetes @pytest.mark.sky_serve -def test_skyserve_update_autoscale(): +def test_skyserve_update_autoscale(generic_cloud): """Test skyserve update with autoscale""" name = _get_service_name() test = Test( f'test-skyserve-update-autoscale', [ - f'sky serve up -n {name} -y tests/skyserve/update/num_min_two.yaml', + f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_two.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', - f'sky serve update {name} -y tests/skyserve/update/num_min_one.yaml', + f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/num_min_one.yaml', # sleep before update is registered. 'sleep 20', # Timeout will be triggered when update fails. From 2a1b91614a6c2fb6d47864003fbab4be3caa8e21 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 18:33:07 -0700 Subject: [PATCH 43/85] disallow file_mounts and workdir if no storage cloud is enabled --- sky/utils/controller_utils.py | 23 ++++++++++++++++++++++- tests/skyserve/llm/service.yaml | 2 +- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index f9e49dad3d6..96bdd39791e 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -535,7 +535,28 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', # whenever task.storage_mounts is non-empty. logger.info(f'{colorama.Fore.YELLOW}Uploading sources to cloud storage.' 
f'{colorama.Style.RESET_ALL} See: sky storage ls') - task.sync_storage_mounts() + try: + task.sync_storage_mounts() + except ValueError as e: + if 'No enabled cloud for storage' in str(e): + data_src = None + if has_local_source_paths_file_mounts: + data_src = 'file_mounts' + if has_local_source_paths_workdir: + if data_src: + data_src += ' and workdir' + else: + data_src = 'workdir' + store_enabled_clouds = ', '.join(storage_lib.STORE_ENABLED_CLOUDS) + with (ux_utils.print_exception_no_traceback()): + raise exceptions.NotSupportedError( + f'Unable to use {data_src} - no cloud with object store ' + 'is enabled. Please enable at least one cloud with ' + f'object store support ({store_enabled_clouds}) by running ' + f'`sky check`, or remove {data_src} from your task.' + '\nHint: If you do not have any cloud access, you may ' + 'download data and code from the network using wget or ' + 'other tools in the `setup` section of the task.') from None # Step 5: Add the file download into the file mounts, such as # /original-dst: s3://spot-fm-file-only-bucket-name/file-0 diff --git a/tests/skyserve/llm/service.yaml b/tests/skyserve/llm/service.yaml index 79370fcd36a..bb65bd533b6 100644 --- a/tests/skyserve/llm/service.yaml +++ b/tests/skyserve/llm/service.yaml @@ -16,7 +16,7 @@ resources: ports: 8087 cloud: gcp accelerators: T4 - memory: 32+ + memory: 20+ setup: | conda activate chatbot From f2420375c3f26c716941939dfd5d700b1f13c668 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 18:34:02 -0700 Subject: [PATCH 44/85] minor --- sky/utils/controller_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 96bdd39791e..0417fba821d 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -554,8 +554,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', 'is enabled. 
Please enable at least one cloud with ' f'object store support ({store_enabled_clouds}) by running ' f'`sky check`, or remove {data_src} from your task.' - '\nHint: If you do not have any cloud access, you may ' - 'download data and code from the network using wget or ' + '\nHint: If you do not have any cloud access, you may still' + ' download data and code over the network using curl or ' 'other tools in the `setup` section of the task.') from None # Step 5: Add the file download into the file mounts, such as From 9b12cf19c8f494e038222ceb337b1227438efee2 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 27 Mar 2024 18:36:25 -0700 Subject: [PATCH 45/85] lint --- sky/clouds/kubernetes.py | 1 + sky/provision/kubernetes/utils.py | 23 +++++++++++------------ sky/utils/controller_utils.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 410efb29574..aed38bff0f2 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -101,6 +101,7 @@ def _unsupported_features_for_resources( is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth() if is_exec_auth: + assert isinstance(message, str), message unsupported_features[ clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message return unsupported_features diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index fa7c245965e..d47379e14a0 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -515,7 +515,7 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ # We now do softer checks to check if exec based auth is used and to # see if the cluster is GPU-enabled. - is_exec_auth, exec_msg = is_kubeconfig_exec_auth() + _, exec_msg = is_kubeconfig_exec_auth() # We now check if GPUs are available and labels are set correctly on the # cluster, and if not we return hints that may help debug any issues. 
@@ -584,17 +584,16 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: if ('exec' in user_details.get('user', {}) and remote_identity == 'LOCAL_CREDENTIALS'): ctx_name = current_context['name'] - exec_msg = ( - 'exec-based authentication is used for ' - f'Kubernetes context {ctx_name!r}.' - ' This may cause issues when running Managed Spot ' - 'or SkyServe controller on Kubernetes. To fix, configure ' - 'SkyPilot to create a service account for running pods by ' - 'adding the following in ~/.sky/config.yaml:\n' - ' kubernetes:\n' - ' remote_identity: SERVICE_ACCOUNT\n' - ' More: https://skypilot.readthedocs.io/en/latest/' - 'reference/config.html') + exec_msg = ('exec-based authentication is used for ' + f'Kubernetes context {ctx_name!r}.' + ' This may cause issues when running Managed Spot ' + 'or SkyServe controller on Kubernetes. To fix, configure ' + 'SkyPilot to create a service account for running pods by ' + 'adding the following in ~/.sky/config.yaml:\n' + ' kubernetes:\n' + ' remote_identity: SERVICE_ACCOUNT\n' + ' More: https://skypilot.readthedocs.io/en/latest/' + 'reference/config.html') return True, exec_msg return False, None diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 0417fba821d..02474b39ba3 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -548,7 +548,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', else: data_src = 'workdir' store_enabled_clouds = ', '.join(storage_lib.STORE_ENABLED_CLOUDS) - with (ux_utils.print_exception_no_traceback()): + with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( f'Unable to use {data_src} - no cloud with object store ' 'is enabled. 
Please enable at least one cloud with ' From 3ca5d470b82b6e2e32ac7114a01081d6d12a5e51 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 28 Mar 2024 17:28:54 -0700 Subject: [PATCH 46/85] update fastchat to use --host 127.0.0.1 --- docs/source/serving/sky-serve.rst | 3 ++- examples/serve/gorilla/gorilla.yaml | 3 ++- examples/serve/vicuna-v1.5.yaml | 3 ++- llm/llama-2/chatbot-hf.yaml | 3 ++- llm/vicuna-llama-2/serve.yaml | 3 ++- llm/vicuna/serve-openai-api-endpoint.yaml | 3 ++- llm/vicuna/serve.yaml | 3 ++- tests/skyserve/llm/service.yaml | 7 +++++-- 8 files changed, 19 insertions(+), 9 deletions(-) diff --git a/docs/source/serving/sky-serve.rst b/docs/source/serving/sky-serve.rst index eb2daa6ffb8..1c4ee3f2751 100644 --- a/docs/source/serving/sky-serve.rst +++ b/docs/source/serving/sky-serve.rst @@ -308,11 +308,12 @@ Let's bring up a real LLM chat service with FastChat + Vicuna. We'll use the `Vi conda activate chatbot echo 'Starting controller...' - python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' python -u -m fastchat.serve.model_worker \ --model-path lmsys/vicuna-${MODEL_SIZE}b-v1.3 2>&1 \ + --host 127.0.0.1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' diff --git a/examples/serve/gorilla/gorilla.yaml b/examples/serve/gorilla/gorilla.yaml index ee46aa94568..e3072d816fb 100644 --- a/examples/serve/gorilla/gorilla.yaml +++ b/examples/serve/gorilla/gorilla.yaml @@ -35,11 +35,12 @@ run: | conda activate chatbot echo 'Starting controller...' - python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' 
python -u -m fastchat.serve.model_worker \ --model-path gorilla-llm/gorilla-falcon-7b-hf-v0 2>&1 \ + --host 127.0.0.1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' diff --git a/examples/serve/vicuna-v1.5.yaml b/examples/serve/vicuna-v1.5.yaml index c94115ea3d7..0f659e85697 100644 --- a/examples/serve/vicuna-v1.5.yaml +++ b/examples/serve/vicuna-v1.5.yaml @@ -34,11 +34,12 @@ run: | conda activate chatbot echo 'Starting controller...' - python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' python -u -m fastchat.serve.model_worker \ --model-path lmsys/vicuna-${MODEL_SIZE}b-v1.5 2>&1 \ + --host 127.0.0.1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' diff --git a/llm/llama-2/chatbot-hf.yaml b/llm/llama-2/chatbot-hf.yaml index 4c0132e4dd4..992c01346e6 100644 --- a/llm/llama-2/chatbot-hf.yaml +++ b/llm/llama-2/chatbot-hf.yaml @@ -24,12 +24,13 @@ run: | conda activate chatbot echo 'Starting controller...' - python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' python -u -m fastchat.serve.model_worker \ --model-path meta-llama/Llama-2-${MODEL_SIZE}b-chat-hf \ --num-gpus $SKYPILOT_NUM_GPUS_PER_NODE 2>&1 \ + --host 127.0.0.1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' diff --git a/llm/vicuna-llama-2/serve.yaml b/llm/vicuna-llama-2/serve.yaml index 0a98dab5d26..69f89f2fc28 100644 --- a/llm/vicuna-llama-2/serve.yaml +++ b/llm/vicuna-llama-2/serve.yaml @@ -27,11 +27,12 @@ run: | conda activate chatbot echo 'Starting controller...' - python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' 
python -u -m fastchat.serve.model_worker \ --model-path /skypilot-vicuna 2>&1 \ + --host 127.0.0.1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' diff --git a/llm/vicuna/serve-openai-api-endpoint.yaml b/llm/vicuna/serve-openai-api-endpoint.yaml index 247043ee3c2..639dfadc6d6 100644 --- a/llm/vicuna/serve-openai-api-endpoint.yaml +++ b/llm/vicuna/serve-openai-api-endpoint.yaml @@ -19,11 +19,12 @@ run: | conda activate chatbot echo 'Starting controller...' - python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' python -u -m fastchat.serve.model_worker \ --model-path lmsys/vicuna-${MODEL_SIZE}b-v1.3 2>&1 \ + --host 127.0.0.1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' diff --git a/llm/vicuna/serve.yaml b/llm/vicuna/serve.yaml index d458112a42f..49185fcea20 100644 --- a/llm/vicuna/serve.yaml +++ b/llm/vicuna/serve.yaml @@ -19,11 +19,12 @@ run: | conda activate chatbot echo 'Starting controller...' - python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' python -u -m fastchat.serve.model_worker \ --model-path lmsys/vicuna-${MODEL_SIZE}b-v1.3 2>&1 \ + --host 127.0.0.1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' diff --git a/tests/skyserve/llm/service.yaml b/tests/skyserve/llm/service.yaml index bb65bd533b6..48160e8f3db 100644 --- a/tests/skyserve/llm/service.yaml +++ b/tests/skyserve/llm/service.yaml @@ -7,6 +7,7 @@ service: - role: user content: How to print hello world? max_tokens: 1 + initial_delay_seconds: 1800 replicas: 1 envs: @@ -16,6 +17,7 @@ resources: ports: 8087 cloud: gcp accelerators: T4 + cpus: 7+ memory: 20+ setup: | @@ -33,11 +35,12 @@ run: | conda activate chatbot echo 'Starting controller...' 
- python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 & sleep 10 echo 'Starting model worker...' python -u -m fastchat.serve.model_worker \ - --model-path lmsys/$MODEL_NAME 2>&1 \ + --host 127.0.0.1 \ + --model-path lmsys/$MODEL_NAME 2>&1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' From 02ef50a667b71990704525e218aa3a6516a4947b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 28 Mar 2024 22:54:25 -0700 Subject: [PATCH 47/85] extend timeout --- tests/skyserve/restart/user_bug.yaml | 1 - tests/test_smoke.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/skyserve/restart/user_bug.yaml b/tests/skyserve/restart/user_bug.yaml index 547f19596a5..8c071a6122c 100644 --- a/tests/skyserve/restart/user_bug.yaml +++ b/tests/skyserve/restart/user_bug.yaml @@ -8,7 +8,6 @@ service: resources: ports: 8080 cpus: 2+ - use_spot: True workdir: tests/skyserve/spot diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 3a7a55051cd..8da562828ca 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3152,7 +3152,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: ], ], _TEARDOWN_SERVICE.format(name=name), - timeout=20 * 60, + timeout=40 * 60, ) run_one_test(test) @@ -3246,6 +3246,7 @@ def test_skyserve_dynamic_ondemand_fallback(): @pytest.mark.serve +@pytest.mark def test_skyserve_user_bug_restart(generic_cloud: str): """Tests that we restart the service after user bug.""" # TODO(zhwu): this behavior needs some rethinking. 
@@ -3510,6 +3511,7 @@ def test_skyserve_update_autoscale(generic_cloud: str): @pytest.mark.serve +@pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes @pytest.mark.parametrize('mode', ['rolling', 'blue_green']) def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): """Test skyserve with update that changes autoscaler""" From 259917fd0e6bf44650ae356eb47e019cbfa59665 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Apr 2024 15:59:22 -0700 Subject: [PATCH 48/85] docs comments --- .../cloud-permissions/kubernetes.rst | 17 +++++++++-------- .../reference/kubernetes/kubernetes-setup.rst | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index dbb1504d30a..be5254c633c 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -23,7 +23,7 @@ SkyPilot can operate using either of the following three authentication methods: 2. **Creating a service account**: SkyPilot can automatically create the service account and roles for itself to manage resources in the Kubernetes cluster. To use this method, set ``remote_identity: SERVICE_ACCOUNT`` to your - Kubernetes configuration in the ``~/.sky/config.yaml`` file: + Kubernetes configuration in the :ref:`~/.sky/config.yaml ` file: .. code-block:: yaml @@ -35,7 +35,7 @@ SkyPilot can operate using either of the following three authentication methods: 3. **Using a custom service account**: If you have a custom service account with the `necessary permissions `__, you can configure - SkyPilot to use it by adding this to your ``~/.sky/config.yaml`` file: + SkyPilot to use it by adding this to your :ref:`~/.sky/config.yaml ` file: .. code-block:: yaml @@ -44,17 +44,17 @@ SkyPilot can operate using either of the following three authentication methods: .. 
note:: - Service account based authentication applies only when the SkyPiolt - controller is running inside the Kubernetes cluster. When running outside - the cluster (e.g., on AWS), SkyPilot will use the local ``~/.kube/config`` - file for authentication. + Service account based authentication applies only when the remote SkyPilot + cluster (including spot and serve controller) is launched inside the + Kubernetes cluster. When running outside the cluster (e.g., on AWS), + SkyPilot will use the local ``~/.kube/config`` file for authentication. Below are the permissions required by SkyPilot and an example service account YAML that you can use to create a service account with the necessary permissions. .. _k8s-permissions: -Permissions required by SkyPilot --------------------------------- +Permissions required for SkyPilot +--------------------------------- SkyPilot requires permissions equivalent to the following roles to be able to manage the resources in the Kubernetes cluster: @@ -103,6 +103,7 @@ SkyPilot requires permissions equivalent to the following roles to be able to ma resources: ["services"] verbs: ["list", "get"] +These roles must apply to both the user account configured in the kubeconfig file and the service account used by SkyPilot (if configured). .. _k8s-sa-example: diff --git a/docs/source/reference/kubernetes/kubernetes-setup.rst b/docs/source/reference/kubernetes/kubernetes-setup.rst index c1a4a664906..3ed1b8c89f0 100644 --- a/docs/source/reference/kubernetes/kubernetes-setup.rst +++ b/docs/source/reference/kubernetes/kubernetes-setup.rst @@ -382,7 +382,7 @@ To use this mode: # ingress-nginx-controller LoadBalancer 10.24.4.254 35.202.58.117 80:31253/TCP,443:32699/TCP .. note:: - If the ``EXTERNAL-IP`` field is ````, you may manually assign it a External IP. + If the ``EXTERNAL-IP`` field is ````, you may manually assign it an External IP. This can be done by patching the service with an IP that can be accessed from outside the cluster. 
If the service type is ``NodePort``, you can set the ``EXTERNAL-IP`` to any node's IP address: From 665a12e75abcfc86acd5ab49548dae35c0e3f101 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 9 Apr 2024 18:00:13 -0700 Subject: [PATCH 49/85] rename to port --- sky/backends/backend_utils.py | 28 ++++++++++++++-------------- sky/serve/replica_managers.py | 2 +- sky/serve/serve_utils.py | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 406cc62ab0a..105e2782c26 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2692,13 +2692,13 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str, def get_endpoints(cluster: str, - endpoint: Optional[Union[int, str]] = None) -> Dict[int, str]: + port: Optional[Union[int, str]] = None) -> Dict[int, str]: """Gets the endpoint for a given cluster and port number (endpoint). Args: cluster: The name of the cluster. - endpoint: The port number to get the endpoint for. If None, ports for - all endpoints are returned. + port: The port number to get the endpoint for. If None, endpoints + for all ports are returned. Returns: A dictionary of port numbers to endpoints. If endpoint is None, the dictionary will contain all ports:endpoints exposed on the cluster. @@ -2709,19 +2709,19 @@ def get_endpoints(cluster: str, are exposed yet. 
""" # Cast endpoint to int if it is not None - if endpoint is not None: + if port is not None: try: - endpoint = int(endpoint) + port = int(port) except ValueError: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Invalid endpoint {endpoint!r}.') from None + raise ValueError(f'Invalid endpoint {port!r}.') from None cluster_records = get_clusters(include_controller=True, refresh=False, cluster_names=[cluster]) cluster_record = cluster_records[0] if cluster_record['status'] != status_lib.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(f'Cluster {cluster_record["name"]!r} ' + raise exceptions.ClusterNotUpError(f'Cluster {cluster_record["name"]!r} ' 'is not in UP status.') handle = cluster_record['handle'] if not isinstance(handle, backends.CloudVmRayResourceHandle): @@ -2747,18 +2747,18 @@ def get_endpoints(cluster: str, provider_config=config['provider']) # Validation before returning the endpoints - if endpoint is not None: + if port is not None: # If the requested endpoint was not to be exposed port_set = resources_utils.port_ranges_to_set( handle.launched_resources.ports) - if endpoint not in port_set: + if port not in port_set: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Port {endpoint} is not exposed ' + raise ValueError(f'Port {port} is not exposed ' 'on cluster ' f'{cluster_record["name"]!r}.') # If the user requested a specific port endpoint, check if it is exposed - if endpoint not in port_details: - error_msg = (f'Port {endpoint} not exposed yet. ' + if port not in port_details: + error_msg = (f'Port {port} not exposed yet. 
' f'{_ENDPOINTS_RETRY_MESSAGE} ') if handle.launched_resources.cloud.is_same_cloud( clouds.Kubernetes()): @@ -2766,7 +2766,7 @@ def get_endpoints(cluster: str, error_msg += (kubernetes_utils.get_endpoint_debug_message()) with ux_utils.print_exception_no_traceback(): raise RuntimeError(error_msg) - return {endpoint: port_details[endpoint][0].url()} + return {port: port_details[port][0].url()} else: if not port_details: # If cluster had no ports to be exposed @@ -2786,4 +2786,4 @@ def get_endpoints(cluster: str, kubernetes_utils.get_endpoint_debug_message() with ux_utils.print_exception_no_traceback(): raise RuntimeError(error_msg) - return {port: urls[0].url() for port, urls in port_details.items()} + return {port_num: urls[0].url() for port_num, urls in port_details.items()} diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index dc0f0f338ed..5df05887f70 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -437,7 +437,7 @@ def url(self) -> Optional[str]: if not endpoint.startswith('http'): endpoint = 'http://' + endpoint return endpoint - except RuntimeError: + except (RuntimeError, exceptions.ClusterNotUpError): return None @property diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index edf6000cbc4..961802a0e56 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -735,7 +735,7 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: endpoint = backend_utils.get_endpoints( handle.cluster_name, load_balancer_port)[load_balancer_port] assert isinstance(endpoint, str) - except RuntimeError: + except (RuntimeError, exceptions.ClusterNotUpError): return '-' return endpoint From fcc63fef7dcebbaeaed9d3bfce795544dcbd9454 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Apr 2024 14:41:41 -0700 Subject: [PATCH 50/85] add to core.py --- sky/core.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sky/core.py b/sky/core.py index 
2736c3d7c5f..5ee2e773e2f 100644 --- a/sky/core.py +++ b/sky/core.py @@ -111,6 +111,26 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, refresh=refresh, cluster_names=cluster_names) +def endpoints(cluster: str, + endpoint: Optional[Union[int, str]] = None) -> Dict[int, str]: + """Gets the endpoint for a given cluster and port number (endpoint). + + Args: + cluster: The name of the cluster. + endpoint: The port number to get the endpoint for. If None, ports for + all endpoints are returned. + + Returns: A dictionary of port numbers to endpoints. If endpoint is None, + the dictionary will contain all ports:endpoints exposed on the cluster. + + Raises: + ValueError: if the cluster is not UP or the endpoint is not exposed. + RuntimeError: if the cluster has no ports to be exposed or no endpoints + are exposed yet. + """ + return backend_utils.get_endpoints(cluster=cluster, + port=endpoint) + @usage_lib.entrypoint def cost_report() -> List[Dict[str, Any]]: From df21a0b402aeb58d2fc5eccc70374c63728db445 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Apr 2024 15:35:05 -0700 Subject: [PATCH 51/85] docstrs --- sky/cli.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sky/cli.py b/sky/cli.py index 5ae42984704..953b940c409 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2451,6 +2451,18 @@ def down( def _hint_or_raise_for_down_spot_controller(controller_name: str): + """ + Helper function to check spot controller status before tearing it down. + + Raises helpful exceptions and errors if the controller is not in a safe + state to be torn down. + + Raises: + RuntimeError: if failed to get the job queue. 
+ exceptions.NotSupportedError: if the controller is not in a safe state + to be torn down (e.g., because it has jobs running or + it is in init state) + """ controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name @@ -2493,6 +2505,18 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): + """ + Helper function to check serve controller status before tearing it down. + + Raises helpful exceptions and errors if the controller is not in a safe + state to be torn down. + + Raises: + RuntimeError: if failed to get the service status. + exceptions.NotSupportedError: if the controller is not in a safe state + to be torn down (e.g., because it has services running or + it is in init state) + """ controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name with rich_utils.safe_status('[bold cyan]Checking for live services[/]'): From 5d8c9c7d99d1cac34fb783afd525c19b4514d8ce Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Apr 2024 15:45:35 -0700 Subject: [PATCH 52/85] add docs on exec based auth --- sky/provision/kubernetes/utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 706e63046a3..af730aa42d3 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -541,6 +541,18 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: """Checks if the kubeconfig file uses exec-based authentication + Exec-based auth is commonly used for authenticating with cloud hosted + Kubernetes services, such as GKE. 
Here is an example snippet from a + kubeconfig using exec-based authentication for a GKE cluster: + - name: mycluster + user: + exec: + apiVersion: client.authentication.k8s.io/v1beta1 + command: /Users/romilb/google-cloud-sdk/bin/gke-gcloud-auth-plugin + installHint: Install gke-gcloud-auth-plugin ... + provideClusterInfo: true + + Using exec-based authentication is problematic when used in conjunction with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/config.yaml. This is because the exec-based authentication may not have the relevant From 937feb670120afbe9a4acb8caa7c4418757b919f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Apr 2024 15:50:54 -0700 Subject: [PATCH 53/85] expand elif --- sky/provision/kubernetes/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index af730aa42d3..6fa274fc6df 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -532,8 +532,10 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \ gpu_msg = str(e) if exec_msg and gpu_msg: return True, f'{gpu_msg}\n Additionally, {exec_msg}' - elif gpu_msg or exec_msg: - return True, gpu_msg or exec_msg + elif gpu_msg: + return True, gpu_msg + elif exec_msg: + return True, exec_msg else: return True, None From 33dfb0135486692d76d0599fe4bc7afe74e39e97 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Apr 2024 15:52:37 -0700 Subject: [PATCH 54/85] add lb comment --- sky/serve/controller.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 377358d3d2e..0d69bd9f11d 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -152,6 +152,9 @@ def configure_logger(): logger.info('SkyServe Controller started on ' f'http://localhost:{self._port}') + # We expose the controller to the public network to allow external + # load balancers (example, for high 
availability load balancers) to + # communicate with the controller. uvicorn.run(self._app, host='0.0.0.0', port=self._port) From 87c77df40db5df74e242a74a9449bdec83771b9d Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Apr 2024 15:46:30 -0700 Subject: [PATCH 55/85] refactor --- sky/cli.py | 4 ++-- sky/core.py | 9 +++++---- sky/serve/core.py | 3 ++- sky/serve/replica_managers.py | 3 ++- sky/serve/serve_utils.py | 7 ++++--- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 953b940c409..c7dcf00308a 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1563,11 +1563,11 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, head_ip = handle.external_ips()[0] if show_endpoints: if endpoint: - cluster_endpoint = backend_utils.get_endpoints( + cluster_endpoint = core.endpoints( cluster_record['name'], endpoint)[endpoint] click.echo(cluster_endpoint) else: - cluster_endpoints = backend_utils.get_endpoints( + cluster_endpoints = core.endpoints( cluster_record['name']) assert isinstance(cluster_endpoints, dict) for port, port_endpoint in cluster_endpoints.items(): diff --git a/sky/core.py b/sky/core.py index 5ee2e773e2f..07d7caecaa4 100644 --- a/sky/core.py +++ b/sky/core.py @@ -111,14 +111,15 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, refresh=refresh, cluster_names=cluster_names) + def endpoints(cluster: str, - endpoint: Optional[Union[int, str]] = None) -> Dict[int, str]: + port: Optional[Union[int, str]] = None) -> Dict[int, str]: """Gets the endpoint for a given cluster and port number (endpoint). Args: cluster: The name of the cluster. - endpoint: The port number to get the endpoint for. If None, ports for - all endpoints are returned. + port: The port number to get the endpoint for. If None, endpoints + for all ports are returned.. Returns: A dictionary of port numbers to endpoints. If endpoint is None, the dictionary will contain all ports:endpoints exposed on the cluster. 
@@ -129,7 +130,7 @@ def endpoints(cluster: str, are exposed yet. """ return backend_utils.get_endpoints(cluster=cluster, - port=endpoint) + port=port) @usage_lib.entrypoint diff --git a/sky/serve/core.py b/sky/serve/core.py index fd4d2d61086..e4a3ec76095 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -8,6 +8,7 @@ import sky from sky import backends +from sky import core from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -279,7 +280,7 @@ def up( else: lb_port = serve_utils.load_service_initialization_result( lb_port_payload) - endpoint = backend_utils.get_endpoints( + endpoint = core.endpoints( controller_handle.cluster_name, lb_port)[lb_port] sky_logging.print( diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 5df05887f70..5299d4ce599 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -17,6 +17,7 @@ import sky from sky import backends +from sky import core from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -430,7 +431,7 @@ def url(self) -> Optional[str]: return None try: replica_port_int = int(self.replica_port) - endpoint = backend_utils.get_endpoints( + endpoint = core.get_endpoints( handle.cluster_name, replica_port_int)[replica_port_int] assert isinstance(endpoint, str) # If replica doesn't start with http or https, add http:// diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 961802a0e56..537c3d83f29 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -21,6 +21,7 @@ import requests from sky import backends +from sky import core from sky import exceptions from sky import global_user_state from sky import status_lib @@ -732,7 +733,7 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: if load_balancer_port is None: return '-' try: - endpoint = backend_utils.get_endpoints( + endpoint = core.endpoints( handle.cluster_name, load_balancer_port)[load_balancer_port] 
assert isinstance(endpoint, str) except (RuntimeError, exceptions.ClusterNotUpError): @@ -819,7 +820,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], service_name = record['service_name'] replica_id = record['replica_id'] version = (record['version'] if 'version' in record else '-') - replica_ip = endpoint if endpoint else '-' + replica_endpoint = endpoint if endpoint else '-' launched_at = log_utils.readable_time_duration(record['launched_at']) resources_str = '-' replica_status = record['status'] @@ -840,7 +841,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], service_name, replica_id, version, - replica_ip, + replica_endpoint, launched_at, resources_str, status_str, From 99547dba78efca4c727e9a402315f820f2a13ebe Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Apr 2024 16:11:25 -0700 Subject: [PATCH 56/85] refactor --- sky/backends/backend_utils.py | 9 ++++++--- sky/cli.py | 13 +++++-------- sky/core.py | 3 +-- sky/provision/kubernetes/utils.py | 2 +- sky/serve/core.py | 3 +-- sky/serve/replica_managers.py | 4 ++-- sky/serve/serve_utils.py | 3 +-- 7 files changed, 17 insertions(+), 20 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 105e2782c26..a7c28479708 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2721,8 +2721,9 @@ def get_endpoints(cluster: str, cluster_record = cluster_records[0] if cluster_record['status'] != status_lib.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): - raise exceptions.ClusterNotUpError(f'Cluster {cluster_record["name"]!r} ' - 'is not in UP status.') + raise exceptions.ClusterNotUpError( + f'Cluster {cluster_record["name"]!r} ' + 'is not in UP status.', cluster_record['status']) handle = cluster_record['handle'] if not isinstance(handle, backends.CloudVmRayResourceHandle): with ux_utils.print_exception_no_traceback(): @@ -2786,4 +2787,6 @@ def get_endpoints(cluster: str, 
kubernetes_utils.get_endpoint_debug_message() with ux_utils.print_exception_no_traceback(): raise RuntimeError(error_msg) - return {port_num: urls[0].url() for port_num, urls in port_details.items()} + return { + port_num: urls[0].url() for port_num, urls in port_details.items() + } diff --git a/sky/cli.py b/sky/cli.py index c7dcf00308a..2a99999a787 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1563,12 +1563,11 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, head_ip = handle.external_ips()[0] if show_endpoints: if endpoint: - cluster_endpoint = core.endpoints( - cluster_record['name'], endpoint)[endpoint] + cluster_endpoint = core.endpoints(cluster_record['name'], + endpoint)[endpoint] click.echo(cluster_endpoint) else: - cluster_endpoints = core.endpoints( - cluster_record['name']) + cluster_endpoints = core.endpoints(cluster_record['name']) assert isinstance(cluster_endpoints, dict) for port, port_endpoint in cluster_endpoints.items(): click.echo( @@ -2451,8 +2450,7 @@ def down( def _hint_or_raise_for_down_spot_controller(controller_name: str): - """ - Helper function to check spot controller status before tearing it down. + """Helper function to check spot controller status before tearing it down. Raises helpful exceptions and errors if the controller is not in a safe state to be torn down. @@ -2505,8 +2503,7 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): - """ - Helper function to check serve controller status before tearing it down. + """Helper function to check serve controller status before tearing it down. Raises helpful exceptions and errors if the controller is not in a safe state to be torn down. diff --git a/sky/core.py b/sky/core.py index 07d7caecaa4..29058d6a803 100644 --- a/sky/core.py +++ b/sky/core.py @@ -129,8 +129,7 @@ def endpoints(cluster: str, RuntimeError: if the cluster has no ports to be exposed or no endpoints are exposed yet. 
""" - return backend_utils.get_endpoints(cluster=cluster, - port=port) + return backend_utils.get_endpoints(cluster=cluster, port=port) @usage_lib.entrypoint diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 6fa274fc6df..236ed708926 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -566,7 +566,7 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: mode is used for remote_identity in ~/.sky/config.yaml. str: Error message if exec-based authentication is used, None otherwise """ - k8s = kubernetes.get_kubernetes() + k8s = kubernetes.kubernetes try: k8s.config.load_kube_config() except kubernetes.config_exception(): diff --git a/sky/serve/core.py b/sky/serve/core.py index e4a3ec76095..fd4d2d61086 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -8,7 +8,6 @@ import sky from sky import backends -from sky import core from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -280,7 +279,7 @@ def up( else: lb_port = serve_utils.load_service_initialization_result( lb_port_payload) - endpoint = core.endpoints( + endpoint = backend_utils.get_endpoints( controller_handle.cluster_name, lb_port)[lb_port] sky_logging.print( diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 5299d4ce599..3c9bba2ee34 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -431,8 +431,8 @@ def url(self) -> Optional[str]: return None try: replica_port_int = int(self.replica_port) - endpoint = core.get_endpoints( - handle.cluster_name, replica_port_int)[replica_port_int] + endpoint = core.endpoints(handle.cluster_name, + replica_port_int)[replica_port_int] assert isinstance(endpoint, str) # If replica doesn't start with http or https, add http:// if not endpoint.startswith('http'): diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 537c3d83f29..9efc792b45c 100644 --- a/sky/serve/serve_utils.py 
+++ b/sky/serve/serve_utils.py @@ -21,7 +21,6 @@ import requests from sky import backends -from sky import core from sky import exceptions from sky import global_user_state from sky import status_lib @@ -733,7 +732,7 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: if load_balancer_port is None: return '-' try: - endpoint = core.endpoints( + endpoint = backend_utils.get_endpoints( handle.cluster_name, load_balancer_port)[load_balancer_port] assert isinstance(endpoint, str) except (RuntimeError, exceptions.ClusterNotUpError): From 3d24b41c493f1a4b4df2e1dbea5fbcffa76acae7 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Apr 2024 16:22:00 -0700 Subject: [PATCH 57/85] fix docs build --- docs/source/cloud-setup/cloud-permissions/kubernetes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index be5254c633c..56760fc4dee 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -31,7 +31,7 @@ SkyPilot can operate using either of the following three authentication methods: remote_identity: SERVICE_ACCOUNT For details on the permissions that are granted to the service account, - refer to the `Permissions required by SkyPilot`_ section below. + refer to the `Permissions required for SkyPilot`_ section below. 3. 
**Using a custom service account**: If you have a custom service account with the `necessary permissions `__, you can configure From 750d4d4dbeaea470ceb8bd7b708fd82dccbb5e81 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 8 Apr 2024 14:27:41 -0700 Subject: [PATCH 58/85] add PODIP mode support --- sky/provision/kubernetes/network.py | 29 +++++++++++++++++++++++ sky/provision/kubernetes/network_utils.py | 9 +++++++ sky/provision/kubernetes/utils.py | 18 ++++++++++++++ sky/utils/kubernetes_enums.py | 1 + 4 files changed, 57 insertions(+) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index cb43c59fb7a..2729b36db9d 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -31,6 +31,9 @@ def open_ports( _open_ports_using_ingress(cluster_name_on_cloud=cluster_name_on_cloud, ports=ports, provider_config=provider_config) + elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: + # Do nothing, as PodIP mode does not require opening ports + pass def _open_ports_using_loadbalancer( @@ -133,6 +136,9 @@ def cleanup_ports( _cleanup_ports_for_ingress(cluster_name_on_cloud=cluster_name_on_cloud, ports=ports, provider_config=provider_config) + elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: + # Do nothing, as PodIP mode does not require opening ports + pass def _cleanup_ports_for_loadbalancer( @@ -193,6 +199,11 @@ def query_ports( cluster_name_on_cloud=cluster_name_on_cloud, ports=ports, ) + elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: + return _query_ports_for_podip( + cluster_name_on_cloud=cluster_name_on_cloud, + ports=ports, + ) else: return {} except kubernetes.kubernetes.client.ApiException as e: @@ -248,3 +259,21 @@ def _query_ports_for_ingress( ] return result + + +def _query_ports_for_podip( + cluster_name_on_cloud: str, + ports: List[int], +) -> Dict[int, List[common.Endpoint]]: + namespace = kubernetes_utils.get_current_kube_config_context_namespace() + 
pod_name = kubernetes_utils.get_pod_name(cluster_name_on_cloud) + pod_ip = network_utils.get_pod_ip(namespace, pod_name) + + result: Dict[int, List[common.Endpoint]] = {} + if pod_ip is None: + return {} + + for port in ports: + result[port] = [common.SocketEndpoint(host=pod_ip, port=port)] + + return result diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 245c20bed2f..110db6b7ca2 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -234,3 +234,12 @@ def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]: ip = service.status.load_balancer.ingress[ 0].ip or service.status.load_balancer.ingress[0].hostname return ip if ip is not None else None + + +def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]: + """Returns the IP address of the pod.""" + core_api = kubernetes.core_api() + pod = core_api.read_namespaced_pod( + pod_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) + + return pod.status.pod_ip if pod.status.pod_ip is not None else None \ No newline at end of file diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 236ed708926..79d8195facd 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1151,6 +1151,9 @@ def get_endpoint_debug_message() -> str: elif port_mode == kubernetes_enums.KubernetesPortMode.LOADBALANCER: endpoint_type = 'LoadBalancer' debug_cmd = 'kubectl describe service' + elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: + endpoint_type = 'PodIP' + debug_cmd = 'kubectl describe pod' return ENDPOINTS_DEBUG_MESSAGE.format(endpoint_type=endpoint_type, debug_cmd=debug_cmd) @@ -1322,3 +1325,18 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool: raise else: return True + + +def get_pod_name(cluster_name_on_cloud: str): + """Returns the pod name of the head pod for the given cluster name on cloud + + Args: + 
cluster_name_on_cloud: Name of the cluster on cloud + + Returns: + str: Pod name of the head pod + """ + # We could have iterated over all pods in the namespace and checked for the + # label, but since we know the naming convention, we can directly return the + # head pod name. + return f'{cluster_name_on_cloud}-head' diff --git a/sky/utils/kubernetes_enums.py b/sky/utils/kubernetes_enums.py index d8af2eb0821..a08e95b4a08 100644 --- a/sky/utils/kubernetes_enums.py +++ b/sky/utils/kubernetes_enums.py @@ -35,3 +35,4 @@ class KubernetesPortMode(enum.Enum): """ INGRESS = 'ingress' LOADBALANCER = 'loadbalancer' + PODIP = 'podip' From 87d4d25daff8471241eefb9349e18a0d8af1264b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 12 Apr 2024 10:03:54 -0700 Subject: [PATCH 59/85] make ssh services optional --- sky/clouds/kubernetes.py | 2 ++ sky/provision/kubernetes/network_utils.py | 17 +++++++++++++++++ sky/templates/kubernetes-ray.yml.j2 | 8 ++++++++ 3 files changed, 27 insertions(+) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 85e7777162c..1e0d1d4009e 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -262,6 +262,7 @@ def make_deploy_resources_variables( kubernetes_utils.get_gpu_label_key_value(acc_type) port_mode = network_utils.get_port_mode(None) + networking_mode = network_utils.get_networking_mode(None) remote_identity = skypilot_config.get_nested( ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) @@ -289,6 +290,7 @@ def make_deploy_resources_variables( 'k8s_namespace': kubernetes_utils.get_current_kube_config_context_namespace(), 'k8s_port_mode': port_mode.value, + 'k8s_networking_mode': networking_mode.value, 'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME, 'k8s_acc_label_key': k8s_acc_label_key, 'k8s_acc_label_value': k8s_acc_label_value, diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 110db6b7ca2..e6402bc49eb 100644 --- 
a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -43,6 +43,23 @@ def get_port_mode( return port_mode +def get_networking_mode(mode_str: Optional[str] = None) -> kubernetes_enums.KubernetesNetworkingMode: + """Get the networking mode from the provider config.""" + mode_str = mode_str or skypilot_config.get_nested( + ('kubernetes', 'networking_mode'), + kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value) + try: + networking_mode = kubernetes_enums.KubernetesNetworkingMode(mode_str) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError(str(e) + + ' Cluster was setup with invalid networking mode.' + + 'Please check the networking_mode in provider config.') \ + from None + + return networking_mode + + def fill_loadbalancer_template(namespace: str, service_name: str, ports: List[int], selector_key: str, selector_value: str) -> Dict: diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index d0b8d72824b..6316e865744 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -155,8 +155,11 @@ provider: name: skypilot-service-account-cluster-role apiGroup: rbac.authorization.k8s.io + {% if k8s_networking_mode == "nodeport" or num_nodes > 1 %} services: + {% if k8s_networking_mode == "nodeport" %} # Service to expose the head node pod's SSH port. + # Required only when using nodeport for accessing ssh. - apiVersion: v1 kind: Service metadata: @@ -171,7 +174,10 @@ provider: - protocol: TCP port: 22 targetPort: 22 + {% endif %} + {% if num_nodes > 1 %} # Service that maps to the head node of the Ray cluster. + # Required only in multi-node settings. - apiVersion: v1 kind: Service metadata: @@ -195,6 +201,8 @@ provider: protocol: TCP port: 8265 targetPort: 8265 + {% endif %} + {% endif %} # Specify the pod type for the ray head node (as configured below). 
head_node_type: ray_head_default From a31a3bc52bbe57710889ad10d9fbfee68cb13c53 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 28 Apr 2024 22:09:18 -0700 Subject: [PATCH 60/85] nits --- docs/source/cloud-setup/cloud-permissions/kubernetes.rst | 2 +- sky/backends/backend_utils.py | 7 ++++--- sky/serve/replica_managers.py | 2 +- sky/utils/common_utils.py | 5 +++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst index 56760fc4dee..5318d76b1a3 100644 --- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst +++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst @@ -92,7 +92,7 @@ SkyPilot requires permissions equivalent to the following roles to be able to ma resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes. verbs: ["get", "list", "watch"] --- - # If using ingresses, role for accessing ingress service IP + # Optional: If using ingresses, role for accessing ingress service IP apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index a7c28479708..69ddcd2e16e 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2728,7 +2728,8 @@ def get_endpoints(cluster: str, if not isinstance(handle, backends.CloudVmRayResourceHandle): with ux_utils.print_exception_no_traceback(): raise ValueError('Querying IP address is not supported ' - 'for local clusters.') + f'for cluster {cluster!r} with backend ' + f'{get_backend_from_handle(handle).NAME}.') launched_resources = handle.launched_resources cloud = launched_resources.cloud @@ -2738,7 +2739,7 @@ def get_endpoints(cluster: str, except exceptions.NotSupportedError: with ux_utils.print_exception_no_traceback(): raise ValueError('Querying endpoints is not supported ' - f'for {cloud}.') from None + f'for cluster {cluster!r} on {cloud}.') 
from None config = common_utils.read_yaml(handle.cluster_yaml) port_details = provision_lib.query_ports(repr(cloud), @@ -2756,7 +2757,7 @@ def get_endpoints(cluster: str, with ux_utils.print_exception_no_traceback(): raise ValueError(f'Port {port} is not exposed ' 'on cluster ' - f'{cluster_record["name"]!r}.') + f'{cluster!r}.') # If the user requested a specific port endpoint, check if it is exposed if port not in port_details: error_msg = (f'Port {port} not exposed yet. ' diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 3c9bba2ee34..3878d5ffbfb 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -433,7 +433,7 @@ def url(self) -> Optional[str]: replica_port_int = int(self.replica_port) endpoint = core.endpoints(handle.cluster_name, replica_port_int)[replica_port_int] - assert isinstance(endpoint, str) + assert isinstance(endpoint, str), endpoint # If replica doesn't start with http or https, add http:// if not endpoint.startswith('http'): endpoint = 'http://' + endpoint diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index c4eab620f71..a5717f14749 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -70,8 +70,9 @@ def get_user_hash(force_fresh_hash: bool = False) -> str: Args: force_fresh_hash: Bypasses the cached hash in USER_HASH_FILE and the hash in the USER_ID_ENV_VAR and forces a fresh user-machine hash - to be generated. - + to be generated. Used by `kubernetes.ssh_key_secret_field_name` to + avoid controllers sharing the same ssh key field name as the + local client. """ def _is_valid_user_hash(user_hash: Optional[str]) -> bool: From e3bb4d74a61ba94a4f66feb90efc59045c3913c2 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 28 Apr 2024 22:10:59 -0700 Subject: [PATCH 61/85] Revert "make ssh services optional" This reverts commit 87d4d25daff8471241eefb9349e18a0d8af1264b. 
--- sky/clouds/kubernetes.py | 2 -- sky/provision/kubernetes/network_utils.py | 17 ----------------- sky/templates/kubernetes-ray.yml.j2 | 8 -------- 3 files changed, 27 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 1e0d1d4009e..85e7777162c 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -262,7 +262,6 @@ def make_deploy_resources_variables( kubernetes_utils.get_gpu_label_key_value(acc_type) port_mode = network_utils.get_port_mode(None) - networking_mode = network_utils.get_networking_mode(None) remote_identity = skypilot_config.get_nested( ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) @@ -290,7 +289,6 @@ def make_deploy_resources_variables( 'k8s_namespace': kubernetes_utils.get_current_kube_config_context_namespace(), 'k8s_port_mode': port_mode.value, - 'k8s_networking_mode': networking_mode.value, 'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME, 'k8s_acc_label_key': k8s_acc_label_key, 'k8s_acc_label_value': k8s_acc_label_value, diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index e6402bc49eb..110db6b7ca2 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -43,23 +43,6 @@ def get_port_mode( return port_mode -def get_networking_mode(mode_str: Optional[str] = None) -> kubernetes_enums.KubernetesNetworkingMode: - """Get the networking mode from the provider config.""" - mode_str = mode_str or skypilot_config.get_nested( - ('kubernetes', 'networking_mode'), - kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value) - try: - networking_mode = kubernetes_enums.KubernetesNetworkingMode(mode_str) - except ValueError as e: - with ux_utils.print_exception_no_traceback(): - raise ValueError(str(e) - + ' Cluster was setup with invalid networking mode.' 
- + 'Please check the networking_mode in provider config.') \ - from None - - return networking_mode - - def fill_loadbalancer_template(namespace: str, service_name: str, ports: List[int], selector_key: str, selector_value: str) -> Dict: diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 6316e865744..d0b8d72824b 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -155,11 +155,8 @@ provider: name: skypilot-service-account-cluster-role apiGroup: rbac.authorization.k8s.io - {% if k8s_networking_mode == "nodeport" or num_nodes > 1 %} services: - {% if k8s_networking_mode == "nodeport" %} # Service to expose the head node pod's SSH port. - # Required only when using nodeport for accessing ssh. - apiVersion: v1 kind: Service metadata: @@ -174,10 +171,7 @@ provider: - protocol: TCP port: 22 targetPort: 22 - {% endif %} - {% if num_nodes > 1 %} # Service that maps to the head node of the Ray cluster. - # Required only in multi-node settings. - apiVersion: v1 kind: Service metadata: @@ -201,8 +195,6 @@ provider: protocol: TCP port: 8265 targetPort: 8265 - {% endif %} - {% endif %} # Specify the pod type for the ray head node (as configured below). head_node_type: ray_head_default From 12096fd85ba3c0ba54ea2dc1a9e3ac23898de2c3 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 28 Apr 2024 22:10:59 -0700 Subject: [PATCH 62/85] Revert "add PODIP mode support" This reverts commit 750d4d4dbeaea470ceb8bd7b708fd82dccbb5e81. 
--- sky/provision/kubernetes/network.py | 29 ----------------------- sky/provision/kubernetes/network_utils.py | 9 ------- sky/provision/kubernetes/utils.py | 18 -------------- sky/utils/kubernetes_enums.py | 1 - 4 files changed, 57 deletions(-) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index 2729b36db9d..cb43c59fb7a 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -31,9 +31,6 @@ def open_ports( _open_ports_using_ingress(cluster_name_on_cloud=cluster_name_on_cloud, ports=ports, provider_config=provider_config) - elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: - # Do nothing, as PodIP mode does not require opening ports - pass def _open_ports_using_loadbalancer( @@ -136,9 +133,6 @@ def cleanup_ports( _cleanup_ports_for_ingress(cluster_name_on_cloud=cluster_name_on_cloud, ports=ports, provider_config=provider_config) - elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: - # Do nothing, as PodIP mode does not require opening ports - pass def _cleanup_ports_for_loadbalancer( @@ -199,11 +193,6 @@ def query_ports( cluster_name_on_cloud=cluster_name_on_cloud, ports=ports, ) - elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: - return _query_ports_for_podip( - cluster_name_on_cloud=cluster_name_on_cloud, - ports=ports, - ) else: return {} except kubernetes.kubernetes.client.ApiException as e: @@ -259,21 +248,3 @@ def _query_ports_for_ingress( ] return result - - -def _query_ports_for_podip( - cluster_name_on_cloud: str, - ports: List[int], -) -> Dict[int, List[common.Endpoint]]: - namespace = kubernetes_utils.get_current_kube_config_context_namespace() - pod_name = kubernetes_utils.get_pod_name(cluster_name_on_cloud) - pod_ip = network_utils.get_pod_ip(namespace, pod_name) - - result: Dict[int, List[common.Endpoint]] = {} - if pod_ip is None: - return {} - - for port in ports: - result[port] = [common.SocketEndpoint(host=pod_ip, port=port)] - - return 
result diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 110db6b7ca2..245c20bed2f 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -234,12 +234,3 @@ def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]: ip = service.status.load_balancer.ingress[ 0].ip or service.status.load_balancer.ingress[0].hostname return ip if ip is not None else None - - -def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]: - """Returns the IP address of the pod.""" - core_api = kubernetes.core_api() - pod = core_api.read_namespaced_pod( - pod_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) - - return pod.status.pod_ip if pod.status.pod_ip is not None else None \ No newline at end of file diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 79d8195facd..236ed708926 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1151,9 +1151,6 @@ def get_endpoint_debug_message() -> str: elif port_mode == kubernetes_enums.KubernetesPortMode.LOADBALANCER: endpoint_type = 'LoadBalancer' debug_cmd = 'kubectl describe service' - elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP: - endpoint_type = 'PodIP' - debug_cmd = 'kubectl describe pod' return ENDPOINTS_DEBUG_MESSAGE.format(endpoint_type=endpoint_type, debug_cmd=debug_cmd) @@ -1325,18 +1322,3 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool: raise else: return True - - -def get_pod_name(cluster_name_on_cloud: str): - """Returns the pod name of the head pod for the given cluster name on cloud - - Args: - cluster_name_on_cloud: Name of the cluster on cloud - - Returns: - str: Pod name of the head pod - """ - # We could have iterated over all pods in the namespace and checked for the - # label, but since we know the naming convention, we can directly return the - # head pod name. 
- return f'{cluster_name_on_cloud}-head' diff --git a/sky/utils/kubernetes_enums.py b/sky/utils/kubernetes_enums.py index a08e95b4a08..d8af2eb0821 100644 --- a/sky/utils/kubernetes_enums.py +++ b/sky/utils/kubernetes_enums.py @@ -35,4 +35,3 @@ class KubernetesPortMode(enum.Enum): """ INGRESS = 'ingress' LOADBALANCER = 'loadbalancer' - PODIP = 'podip' From c33572e75dba384d7b384acfbd64a9317d33d5a3 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 28 Apr 2024 23:01:50 -0700 Subject: [PATCH 63/85] nits --- sky/provision/__init__.py | 2 +- sky/serve/serve_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py index 58a05f04550..7981a1f4545 100644 --- a/sky/provision/__init__.py +++ b/sky/provision/__init__.py @@ -148,7 +148,7 @@ def query_ports( If head_ip is provided, it may be used by the cloud implementation to return the endpoint without querying the cloud provider. If head_ip is not - provider, the cloud provider will be queried to get the endpoint info. + provided, the cloud provider will be queried to get the endpoint info. Returns a dict with port as the key and a list of common.Endpoint. 
""" diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 9efc792b45c..62001c44f4f 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -815,7 +815,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], replica_records = replica_records[:_REPLICA_TRUNC_NUM] for record in replica_records: - endpoint = record.get('endpoint', None) + endpoint = record.get('endpoint', '-') service_name = record['service_name'] replica_id = record['replica_id'] version = (record['version'] if 'version' in record else '-') From 803df4ab7dd95f9e7f3834199071c501e6207052 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 29 Apr 2024 11:13:56 -0700 Subject: [PATCH 64/85] use 0.0.0.0 when on k8s; use common impl for other clouds --- sky/provision/__init__.py | 11 ++++++++--- sky/provision/aws/__init__.py | 1 - sky/provision/aws/instance.py | 11 ----------- sky/provision/azure/__init__.py | 1 - sky/provision/azure/instance.py | 11 ----------- sky/provision/gcp/__init__.py | 1 - sky/provision/gcp/instance.py | 11 ----------- sky/serve/controller.py | 23 ++++++++++++++++------- 8 files changed, 24 insertions(+), 46 deletions(-) diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py index 7981a1f4545..2f9a5bda44c 100644 --- a/sky/provision/__init__.py +++ b/sky/provision/__init__.py @@ -41,8 +41,12 @@ def _wrapper(*args, **kwargs): module = globals().get(module_name) assert module is not None, f'Unknown provider: {module_name}' - impl = getattr(module, func.__name__) - return impl(*args, **kwargs) + impl = getattr(module, func.__name__, None) + if impl: + return impl(*args, **kwargs) + + # If implementation does not exist, fall back to default implementation + return func(provider_name, *args, **kwargs) return _wrapper @@ -152,7 +156,8 @@ def query_ports( Returns a dict with port as the key and a list of common.Endpoint. 
""" - raise NotImplementedError + del provider_name, provider_config, cluster_name_on_cloud # unused + return common.query_ports_passthrough(ports, head_ip) @_route_to_cloud_impl diff --git a/sky/provision/aws/__init__.py b/sky/provision/aws/__init__.py index bcbe646f219..e569d3b042e 100644 --- a/sky/provision/aws/__init__.py +++ b/sky/provision/aws/__init__.py @@ -5,7 +5,6 @@ from sky.provision.aws.instance import get_cluster_info from sky.provision.aws.instance import open_ports from sky.provision.aws.instance import query_instances -from sky.provision.aws.instance import query_ports from sky.provision.aws.instance import run_instances from sky.provision.aws.instance import stop_instances from sky.provision.aws.instance import terminate_instances diff --git a/sky/provision/aws/instance.py b/sky/provision/aws/instance.py index abdf67f4a82..a7b53593ea4 100644 --- a/sky/provision/aws/instance.py +++ b/sky/provision/aws/instance.py @@ -876,14 +876,3 @@ def get_cluster_info( instances=instances, head_instance_id=head_instance_id, ) - - -def query_ports( - cluster_name_on_cloud: str, - ports: List[str], - head_ip: Optional[str] = None, - provider_config: Optional[Dict[str, Any]] = None, -) -> Dict[int, List[common.Endpoint]]: - """See sky/provision/__init__.py""" - del provider_config, cluster_name_on_cloud # unused - return common.query_ports_passthrough(ports, head_ip) diff --git a/sky/provision/azure/__init__.py b/sky/provision/azure/__init__.py index 9c87fc907db..b83dbb462d9 100644 --- a/sky/provision/azure/__init__.py +++ b/sky/provision/azure/__init__.py @@ -2,4 +2,3 @@ from sky.provision.azure.instance import cleanup_ports from sky.provision.azure.instance import open_ports -from sky.provision.azure.instance import query_ports diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index 6e08e1863d3..b68b692fe67 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -94,14 +94,3 @@ def cleanup_ports( # Azure 
will automatically cleanup network security groups when cleanup # resource group. So we don't need to do anything here. del cluster_name_on_cloud, ports, provider_config # Unused. - - -def query_ports( - cluster_name_on_cloud: str, - ports: List[str], - head_ip: Optional[str] = None, - provider_config: Optional[Dict[str, Any]] = None, -) -> Dict[int, List[common.Endpoint]]: - """See sky/provision/__init__.py""" - del provider_config, cluster_name_on_cloud # unused - return common.query_ports_passthrough(ports, head_ip) diff --git a/sky/provision/gcp/__init__.py b/sky/provision/gcp/__init__.py index fdadd5345e2..0d24a577690 100644 --- a/sky/provision/gcp/__init__.py +++ b/sky/provision/gcp/__init__.py @@ -5,7 +5,6 @@ from sky.provision.gcp.instance import get_cluster_info from sky.provision.gcp.instance import open_ports from sky.provision.gcp.instance import query_instances -from sky.provision.gcp.instance import query_ports from sky.provision.gcp.instance import run_instances from sky.provision.gcp.instance import stop_instances from sky.provision.gcp.instance import terminate_instances diff --git a/sky/provision/gcp/instance.py b/sky/provision/gcp/instance.py index 95d95209787..35c8ae44dc8 100644 --- a/sky/provision/gcp/instance.py +++ b/sky/provision/gcp/instance.py @@ -615,14 +615,3 @@ def cleanup_ports( firewall_rule_name = provider_config['firewall_rule'] instance_utils.GCPComputeInstance.delete_firewall_rule( project_id, firewall_rule_name) - - -def query_ports( - cluster_name_on_cloud: str, - ports: List[str], - head_ip: Optional[str] = None, - provider_config: Optional[Dict[str, Any]] = None, -) -> Dict[int, List[common.Endpoint]]: - """See sky/provision/__init__.py""" - del provider_config, cluster_name_on_cloud # unused - return common.query_ports_passthrough(ports, head_ip) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 0d69bd9f11d..7db6cd0f791 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -3,6 +3,7 @@ 
Responsible for autoscaling and replica management. """ import logging +import os import threading import time import traceback @@ -39,7 +40,7 @@ class SkyServeController: """ def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec, - task_yaml: str, port: int) -> None: + task_yaml: str, host: str, port: int) -> None: self._service_name = service_name self._replica_manager: replica_managers.ReplicaManager = ( replica_managers.SkyPilotReplicaManager(service_name=service_name, @@ -47,6 +48,7 @@ def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec, task_yaml_path=task_yaml)) self._autoscaler: autoscalers.Autoscaler = ( autoscalers.Autoscaler.from_spec(service_name, service_spec)) + self._host = host self._port = port self._app = fastapi.FastAPI() @@ -150,18 +152,25 @@ def configure_logger(): threading.Thread(target=self._run_autoscaler).start() logger.info('SkyServe Controller started on ' - f'http://localhost:{self._port}') + f'http://{self._host}:{self._port}') - # We expose the controller to the public network to allow external - # load balancers (example, for high availability load balancers) to - # communicate with the controller. - uvicorn.run(self._app, host='0.0.0.0', port=self._port) + uvicorn.run(self._app, host={host}, port=self._port) # TODO(tian): Probably we should support service that will stop the VM in # specific time period. def run_controller(service_name: str, service_spec: serve.SkyServiceSpec, task_yaml: str, controller_port: int): + # We expose the controller to the public network when running inside a + # kubernetes cluster to allow external load balancers (example, for + # high availability load balancers) to communicate with the controller. 
+ def _get_host(): + if "KUBERNETES_SERVICE_HOST" in os.environ: + return '0.0.0.0' + else: + return 'localhost' + + host = _get_host() controller = SkyServeController(service_name, service_spec, task_yaml, - controller_port) + host, controller_port) controller.run() From a0ba0d17f91ad465dbf1bc43e87e2ecde92b5ba4 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 29 Apr 2024 12:53:52 -0700 Subject: [PATCH 65/85] return dict instead of raising errors in core.endpoints() --- sky/backends/backend_utils.py | 30 ++++++++++++++++-------------- sky/cli.py | 11 ++++++++++- sky/serve/controller.py | 2 +- sky/serve/replica_managers.py | 21 ++++++++++++--------- sky/serve/serve_utils.py | 9 ++++++--- 5 files changed, 45 insertions(+), 28 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 69ddcd2e16e..1620f6d2be8 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2702,11 +2702,14 @@ def get_endpoints(cluster: str, Returns: A dictionary of port numbers to endpoints. If endpoint is None, the dictionary will contain all ports:endpoints exposed on the cluster. + If the endpoint is not exposed yet (e.g., during cluster launch or + waiting for cloud provider to expose the endpoint), an empty dictionary + is returned. Raises: - ValueError: if the cluster is not UP or the endpoint is not exposed. - RuntimeError: if the cluster has no ports to be exposed or no endpoints - are exposed yet. + ValueError: if the port is invalid or the cloud provider does not + support querying endpoints. + exceptions.ClusterNotUpError: if the cluster is not in UP status. 
""" # Cast endpoint to int if it is not None if port is not None: @@ -2754,10 +2757,9 @@ def get_endpoints(cluster: str, port_set = resources_utils.port_ranges_to_set( handle.launched_resources.ports) if port not in port_set: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Port {port} is not exposed ' - 'on cluster ' - f'{cluster!r}.') + logger.warning(f'Port {port} is not exposed on ' + f'cluster {cluster!r}.') + return {} # If the user requested a specific port endpoint, check if it is exposed if port not in port_details: error_msg = (f'Port {port} not exposed yet. ' @@ -2766,16 +2768,16 @@ def get_endpoints(cluster: str, clouds.Kubernetes()): # Add Kubernetes specific debugging info error_msg += (kubernetes_utils.get_endpoint_debug_message()) - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(error_msg) + logger.warning(error_msg) + return {} return {port: port_details[port][0].url()} else: if not port_details: # If cluster had no ports to be exposed if handle.launched_resources.ports is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError('Cluster does not have any ports ' - 'to be exposed.') + logger.warning(f'Cluster {cluster!r} does not have any ' + 'ports to be exposed.') + return {} # Else ports have not been exposed even though they exist. # In this case, ask the user to retry. 
else: @@ -2786,8 +2788,8 @@ def get_endpoints(cluster: str, # Add Kubernetes specific debugging info error_msg += \ kubernetes_utils.get_endpoint_debug_message() - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(error_msg) + logger.warning(error_msg) + return {} return { port_num: urls[0].url() for port_num, urls in port_details.items() } diff --git a/sky/cli.py b/sky/cli.py index 2a99999a787..16f1c56b148 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1564,11 +1564,20 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, if show_endpoints: if endpoint: cluster_endpoint = core.endpoints(cluster_record['name'], - endpoint)[endpoint] + endpoint).get(endpoint, + None) + if not cluster_endpoint: + raise click.Abort( + f'Endpoint {endpoint} not found for cluster ' + f'{cluster_record["name"]!r}.') click.echo(cluster_endpoint) else: cluster_endpoints = core.endpoints(cluster_record['name']) assert isinstance(cluster_endpoints, dict) + if not cluster_endpoints: + raise click.Abort( + f'No endpoint found for cluster ' + f'{cluster_record["name"]!r}.') for port, port_endpoint in cluster_endpoints.items(): click.echo( f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 7db6cd0f791..d788c241b61 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -154,7 +154,7 @@ def configure_logger(): logger.info('SkyServe Controller started on ' f'http://{self._host}:{self._port}') - uvicorn.run(self._app, host={host}, port=self._port) + uvicorn.run(self._app, host={self._host}, port=self._port) # TODO(tian): Probably we should support service that will stop the VM in diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 3878d5ffbfb..efb3ba3cf48 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -429,17 +429,20 @@ def url(self) -> Optional[str]: handle = self.handle() if handle is None: return None + replica_port_int = 
int(self.replica_port) try: - replica_port_int = int(self.replica_port) - endpoint = core.endpoints(handle.cluster_name, - replica_port_int)[replica_port_int] - assert isinstance(endpoint, str), endpoint - # If replica doesn't start with http or https, add http:// - if not endpoint.startswith('http'): - endpoint = 'http://' + endpoint - return endpoint - except (RuntimeError, exceptions.ClusterNotUpError): + endpoint_dict = core.endpoints(handle.cluster_name, + replica_port_int) + except exceptions.ClusterNotUpError: return None + endpoint = endpoint_dict.get(replica_port_int, None) + if not endpoint: + return None + assert isinstance(endpoint, str), endpoint + # If replica doesn't start with http or https, add http:// + if not endpoint.startswith('http'): + endpoint = 'http://' + endpoint + return endpoint @property def status(self) -> serve_state.ReplicaStatus: diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 62001c44f4f..77be8ed4d48 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -733,10 +733,13 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: return '-' try: endpoint = backend_utils.get_endpoints( - handle.cluster_name, load_balancer_port)[load_balancer_port] - assert isinstance(endpoint, str) - except (RuntimeError, exceptions.ClusterNotUpError): + handle.cluster_name, load_balancer_port).get(load_balancer_port, + None) + except exceptions.ClusterNotUpError: return '-' + if endpoint is None: + return '-' + assert isinstance(endpoint, str), endpoint return endpoint From fb882187ede3d8d6d5ec08f365490c277cdeb6de Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 29 Apr 2024 12:56:53 -0700 Subject: [PATCH 66/85] lint --- sky/cli.py | 9 ++++----- sky/provision/azure/instance.py | 1 - sky/serve/controller.py | 6 +++--- sky/serve/serve_utils.py | 6 +++--- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 16f1c56b148..a324955ef44 100644 --- a/sky/cli.py +++ 
b/sky/cli.py @@ -1564,8 +1564,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, if show_endpoints: if endpoint: cluster_endpoint = core.endpoints(cluster_record['name'], - endpoint).get(endpoint, - None) + endpoint).get( + endpoint, None) if not cluster_endpoint: raise click.Abort( f'Endpoint {endpoint} not found for cluster ' @@ -1575,9 +1575,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, cluster_endpoints = core.endpoints(cluster_record['name']) assert isinstance(cluster_endpoints, dict) if not cluster_endpoints: - raise click.Abort( - f'No endpoint found for cluster ' - f'{cluster_record["name"]!r}.') + raise click.Abort(f'No endpoint found for cluster ' + f'{cluster_record["name"]!r}.') for port, port_endpoint in cluster_endpoints.items(): click.echo( f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index b68b692fe67..2c0f7e93f93 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -4,7 +4,6 @@ from sky import sky_logging from sky.adaptors import azure -from sky.provision import common from sky.utils import ux_utils logger = sky_logging.init_logger(__name__) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index d788c241b61..8d7964f090b 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -165,12 +165,12 @@ def run_controller(service_name: str, service_spec: serve.SkyServiceSpec, # kubernetes cluster to allow external load balancers (example, for # high availability load balancers) to communicate with the controller. 
def _get_host(): - if "KUBERNETES_SERVICE_HOST" in os.environ: + if 'KUBERNETES_SERVICE_HOST' in os.environ: return '0.0.0.0' else: return 'localhost' host = _get_host() - controller = SkyServeController(service_name, service_spec, task_yaml, - host, controller_port) + controller = SkyServeController(service_name, service_spec, task_yaml, host, + controller_port) controller.run() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 77be8ed4d48..8a4387b40c0 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -732,9 +732,9 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: if load_balancer_port is None: return '-' try: - endpoint = backend_utils.get_endpoints( - handle.cluster_name, load_balancer_port).get(load_balancer_port, - None) + endpoint = backend_utils.get_endpoints(handle.cluster_name, + load_balancer_port).get( + load_balancer_port, None) except exceptions.ClusterNotUpError: return '-' if endpoint is None: From e22908aaa67d35e9a09ead459396b79637787f33 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 29 Apr 2024 13:10:24 -0700 Subject: [PATCH 67/85] merge fixes --- sky/utils/controller_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index a7cfa9712ec..fa4275353bd 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -166,26 +166,26 @@ def _get_cloud_dependencies_installation_commands( # TODO(tian): Make dependency installation command a method of cloud # class and get all installation command for enabled clouds. 
# AWS - if clouds.cloud_in_list(clouds.AWS(), enabled_clouds): + if clouds.cloud_in_iterable(clouds.AWS(), enabled_clouds): commands.append( 'pip list | grep boto3 > /dev/null 2>&1 || ' 'pip install "urllib3<2" awscli>=1.27.10 botocore>=1.29.10 ' 'boto3>=1.26.1 > /dev/null 2>&1') # GCP - if clouds.cloud_in_list(clouds.GCP(), enabled_clouds): + if clouds.cloud_in_iterable(clouds.GCP(), enabled_clouds): commands.extend([ 'pip list | grep google-api-python-client > /dev/null 2>&1 || ' 'pip install google-api-python-client>=2.69.0 google-cloud-storage ' '> /dev/null 2>&1', f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}' ]) # Azure - if clouds.cloud_in_list(clouds.Azure(), enabled_clouds): + if clouds.cloud_in_iterable(clouds.Azure(), enabled_clouds): commands.append( 'pip list | grep azure-cli > /dev/null 2>&1 || ' 'pip install azure-cli>=2.31.0 azure-core azure-identity>=1.13.0 ' 'azure-mgmt-network > /dev/null 2>&1') # Kubernetes - if clouds.cloud_in_list(clouds.Kubernetes(), enabled_clouds): + if clouds.cloud_in_iterable(clouds.Kubernetes(), enabled_clouds): commands.append( # Install k8s + skypilot dependencies 'sudo bash -c "if ' From 34a719c44a3d00e7c7c72037d9eb0be84eca45cf Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 29 Apr 2024 13:16:23 -0700 Subject: [PATCH 68/85] merge fixes --- sky/utils/controller_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index fa4275353bd..53a6fe7b09d 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -204,7 +204,7 @@ def _get_cloud_dependencies_installation_commands( if controller_type == 'spot': # oci doesn't support open port yet, so we don't install oci # dependencies for sky serve controller. 
- if clouds.cloud_in_list(clouds.OCI(), enabled_clouds): + if clouds.cloud_in_iterable(clouds.OCI(), enabled_clouds): commands.append('pip list | grep oci > /dev/null 2>&1 || ' 'pip install oci > /dev/null 2>&1') # ibm doesn't support open port and spot instance yet, so we don't From cd08d0f16a893b282226b61d69ac2ee083a20e79 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 13:25:39 -0700 Subject: [PATCH 69/85] merge fixes --- sky/provision/kubernetes/utils.py | 2 +- sky/utils/controller_utils.py | 36 +++++++++++++++---------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 5ee6ed340b4..6108dffd46c 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -640,7 +640,7 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: ctx_name = current_context['name'] exec_msg = ('exec-based authentication is used for ' f'Kubernetes context {ctx_name!r}.' - ' This may cause issues when running Managed Spot ' + ' This may cause issues when running Managed Jobs ' 'or SkyServe controller on Kubernetes. 
To fix, configure ' 'SkyPilot to create a service account for running pods by ' 'adding the following in ~/.sky/config.yaml:\n' diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 9a2ceadacd1..ae14dee4107 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -228,6 +228,24 @@ def _get_cloud_dependencies_installation_commands( 'pip list | grep google-cloud-storage > /dev/null 2>&1 || ' 'pip install google-cloud-storage > /dev/null 2>&1') commands.append(f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}') + elif isinstance(cloud, clouds.Kubernetes): + commands.append( + f'echo -en "\\r{prefix_str}Kubernetes{empty_str}" && ' + 'pip list | grep kubernetes > /dev/null 2>&1 || ' + 'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&' + # Install k8s + skypilot dependencies + 'sudo bash -c "if ' + '! command -v curl &> /dev/null || ' + '! command -v socat &> /dev/null || ' + '! command -v netcat &> /dev/null; ' + 'then apt update && apt install curl socat netcat -y; ' + 'fi" && ' + # Install kubectl + '(command -v kubectl &>/dev/null || ' + '(curl -LO "https://dl.k8s.io/release/$(curl -L -s ' + 'https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' + 'sudo install -o root -g root -m 0755 ' + 'kubectl /usr/local/bin/kubectl))') if controller == Controllers.JOBS_CONTROLLER: if isinstance(cloud, clouds.IBM): commands.append( @@ -239,24 +257,6 @@ def _get_cloud_dependencies_installation_commands( commands.append(f'echo -en "\\r{prefix_str}OCI{empty_str}" && ' 'pip list | grep oci > /dev/null 2>&1 || ' 'pip install oci > /dev/null 2>&1') - elif isinstance(cloud, clouds.Kubernetes): - commands.append( - f'echo -en "\\r{prefix_str}Kubernetes{empty_str}" && ' - 'pip list | grep kubernetes > /dev/null 2>&1 || ' - 'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&' - # Install k8s + skypilot dependencies - 'sudo bash -c "if ' - '! command -v curl &> /dev/null || ' - '! command -v socat &> /dev/null || ' - '! 
command -v netcat &> /dev/null; ' - 'then apt update && apt install curl socat netcat -y; ' - 'fi" && ' - # Install kubectl - '(command -v kubectl &>/dev/null || ' - '(curl -LO "https://dl.k8s.io/release/$(curl -L -s ' - 'https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' - 'sudo install -o root -g root -m 0755 ' - 'kubectl /usr/local/bin/kubectl)) && ') elif isinstance(cloud, clouds.RunPod): commands.append( f'echo -en "\\r{prefix_str}RunPod{empty_str}" && ' From 05f1996fa4e7c7fafa58d9ab7534f0bb55cf588f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 13:27:50 -0700 Subject: [PATCH 70/85] lint --- sky/cli.py | 1 - sky/utils/controller_utils.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 88134de590c..2e863f2eef7 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -52,7 +52,6 @@ from sky import exceptions from sky import global_user_state from sky import jobs as managed_jobs -from sky import provision as provision_lib from sky import serve as serve_lib from sky import sky_logging from sky import status_lib diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index ae14dee4107..4087d2d984c 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -242,8 +242,9 @@ def _get_cloud_dependencies_installation_commands( 'fi" && ' # Install kubectl '(command -v kubectl &>/dev/null || ' - '(curl -LO "https://dl.k8s.io/release/$(curl -L -s ' - 'https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && ' + '(curl -LO "https://dl.k8s.io/release/' + '$(curl -L -s https://dl.k8s.io/release/stable.txt)' + '/bin/linux/amd64/kubectl" && ' 'sudo install -o root -g root -m 0755 ' 'kubectl /usr/local/bin/kubectl))') if controller == Controllers.JOBS_CONTROLLER: From 97b1d56e36090cc6b0f41ff995c64c096259bc97 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 13:44:28 -0700 Subject: [PATCH 71/85] fix smoke tests --- tests/test_smoke.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index c4053c18b95..9a819a7dc25 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3244,7 +3244,7 @@ def test_skyserve_azure_http(): @pytest.mark.kubernetes -@pytest.mark.sky_serve +@pytest.mark.serve def test_skyserve_kubernetes_http(): """Test skyserve on Kubernetes""" name = _get_service_name() From 9a8ccd1d8b4aab6890b10c7e71227498c8cabb00 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 15:02:53 -0700 Subject: [PATCH 72/85] fix smoke tests --- tests/test_yamls/minimal.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_yamls/minimal.yaml b/tests/test_yamls/minimal.yaml index d7f9d4482f2..33210f829bf 100644 --- a/tests/test_yamls/minimal.yaml +++ b/tests/test_yamls/minimal.yaml @@ -2,6 +2,7 @@ name: min setup: | echo "running setup" + sudo apt-get install -y jq run: | conda env list From 3b7d33f948de5681d034febe0c235fa375ac3517 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 15:02:57 -0700 Subject: [PATCH 73/85] comment --- sky/utils/common_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index cc7afc424a9..b5b9027de65 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -106,7 +106,9 @@ def _is_valid_user_hash(user_hash: Optional[str]) -> bool: os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True) if not force_fresh_hash: # Do not cache to file if force_fresh_hash is True since the file may - # be intentionally using a different hash. + # be intentionally using a different hash, e.g. we want to keep the + # user_hash for usage collection the same on the jobs/serve controller + # as users' local client. 
with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f: f.write(user_hash) return user_hash From e029ebceade3d63de9dd5b147e210a732ce00cd7 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 15:35:54 -0700 Subject: [PATCH 74/85] add enum for remote identity --- sky/backends/backend_utils.py | 2 +- sky/clouds/kubernetes.py | 4 ++-- sky/provision/kubernetes/utils.py | 2 +- sky/utils/schemas.py | 22 +++++++++++++++++++--- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 1d8630ea1a7..f7a8a29e981 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -810,7 +810,7 @@ def write_cluster_config( if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]): remote_identity = list(profile.values())[0] break - if remote_identity != 'LOCAL_CREDENTIALS': + if remote_identity != schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value: if not cloud.supports_service_account_on_remote(): raise exceptions.InvalidCloudConfigs( 'remote_identity: SERVICE_ACCOUNT is specified in ' diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 8c6c559c041..eda0c5f6baa 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -271,11 +271,11 @@ def make_deploy_resources_variables( remote_identity = skypilot_config.get_nested( ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) - if remote_identity == 'LOCAL_CREDENTIALS': + if remote_identity == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value: # SA name doesn't matter since automounting credentials is disabled k8s_service_account_name = 'default' k8s_automount_sa_token = 'false' - elif remote_identity == 'SERVICE_ACCOUNT': + elif remote_identity == schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value: # Use the default service account k8s_service_account_name = self.SKY_DEFAULT_SERVICE_ACCOUNT_NAME k8s_automount_sa_token = 'true' diff --git a/sky/provision/kubernetes/utils.py 
b/sky/provision/kubernetes/utils.py index 6108dffd46c..c0223e17cdd 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -636,7 +636,7 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: remote_identity = skypilot_config.get_nested( ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) if ('exec' in user_details.get('user', {}) and - remote_identity == 'LOCAL_CREDENTIALS'): + remote_identity == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value): ctx_name = current_context['name'] exec_msg = ('exec-based authentication is used for ' f'Kubernetes context {ctx_name!r}.' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index eb86677af13..1caa090be2b 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -3,6 +3,7 @@ Schemas conform to the JSON Schema specification as defined at https://json-schema.org/ """ +import enum def _check_not_both_fields_present(field1: str, field2: str): @@ -522,10 +523,27 @@ def get_cluster_schema(): } } + +class RemoteIdentityOptions(enum.Enum): + """Enum for remote identity types. + + Some clouds (e.g., AWS, Kubernetes) also allow string values for remote + identity, which map to the service account/role to use. Those are not + included in this enum. 
+ """ + LOCAL_CREDENTIALS = 'LOCAL_CREDENTIALS' + SERVICE_ACCOUNT = 'SERVICE_ACCOUNT' + + +REMOTE_IDENTITY_DEFAULT = RemoteIdentityOptions.LOCAL_CREDENTIALS.value + + _REMOTE_IDENTITY_SCHEMA = { 'remote_identity': { 'type': 'string', - 'case_insensitive_enum': ['LOCAL_CREDENTIALS', 'SERVICE_ACCOUNT'] + 'case_insensitive_enum': [ + option.value for option in RemoteIdentityOptions + ] } } @@ -562,8 +580,6 @@ def get_cluster_schema(): }, } -REMOTE_IDENTITY_DEFAULT = 'LOCAL_CREDENTIALS' - def get_config_schema(): # pylint: disable=import-outside-toplevel From 7b201f290ccffbbfc7c9f55b355f80646b3e6047 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 15:41:53 -0700 Subject: [PATCH 75/85] lint --- sky/clouds/kubernetes.py | 6 ++++-- sky/provision/kubernetes/utils.py | 4 ++-- sky/utils/schemas.py | 5 ++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index eda0c5f6baa..5740e0ed9b1 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -271,11 +271,13 @@ def make_deploy_resources_variables( remote_identity = skypilot_config.get_nested( ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) - if remote_identity == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value: + if (remote_identity == + schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value): # SA name doesn't matter since automounting credentials is disabled k8s_service_account_name = 'default' k8s_automount_sa_token = 'false' - elif remote_identity == schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value: + elif (remote_identity == + schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value): # Use the default service account k8s_service_account_name = self.SKY_DEFAULT_SERVICE_ACCOUNT_NAME k8s_automount_sa_token = 'true' diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index c0223e17cdd..4c26c0c2199 100644 --- a/sky/provision/kubernetes/utils.py +++ 
b/sky/provision/kubernetes/utils.py @@ -635,8 +635,8 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: remote_identity = skypilot_config.get_nested( ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) - if ('exec' in user_details.get('user', {}) and - remote_identity == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value): + if ('exec' in user_details.get('user', {}) and remote_identity + == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value): ctx_name = current_context['name'] exec_msg = ('exec-based authentication is used for ' f'Kubernetes context {ctx_name!r}.' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 1caa090be2b..518d5edf07a 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -537,13 +537,12 @@ class RemoteIdentityOptions(enum.Enum): REMOTE_IDENTITY_DEFAULT = RemoteIdentityOptions.LOCAL_CREDENTIALS.value - _REMOTE_IDENTITY_SCHEMA = { 'remote_identity': { 'type': 'string', 'case_insensitive_enum': [ - option.value for option in RemoteIdentityOptions - ] + option.value for option in RemoteIdentityOptions + ] } } From f31553b4a461a6a074f5cc0e3cad358dc686f150 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 17:49:35 -0700 Subject: [PATCH 76/85] disable autostop for kubernetes --- sky/backends/cloud_vm_ray_backend.py | 15 ++++++++++++++- sky/clouds/kubernetes.py | 7 ------- sky/utils/controller_utils.py | 16 +++++++++++++++- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4d0fdb8d68b..f660032aca4 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1991,9 +1991,17 @@ def provision_with_retries( cloud_user = None else: cloud_user = to_provision.cloud.get_current_user_identity() + + requested_features = self._requested_features.copy() + # Skip stop feature for Kubernetes jobs controller. 
+ if isinstance(to_provision.cloud, clouds.Kubernetes) and controller_utils.Controllers.from_name( + cluster_name) == controller_utils.Controllers.JOBS_CONTROLLER: + requested_features.remove( + clouds.CloudImplementationFeatures.STOP) + # Skip if to_provision.cloud does not support requested features to_provision.cloud.check_features_are_supported( - to_provision, self._requested_features) + to_provision, requested_features) config_dict = self._retry_zones( to_provision, @@ -4045,6 +4053,11 @@ def set_autostop(self, # The core.autostop() function should have already checked that the # cloud and resources support requested autostop. if idle_minutes_to_autostop is not None: + # Skip auto-stop for Kubernetes clusters. + if isinstance(handle.launched_resources.cloud, clouds.Kubernetes): + logger.info('Auto-stop is not supported for Kubernetes ' + 'clusters. Skipping.') + return # Check if we're stopping spot assert (handle.launched_resources is not None and diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index be9111feac5..ca5f3f9c803 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -71,13 +71,6 @@ class Kubernetes(clouds.Cloud): 'tiers are not ' 'supported in ' 'Kubernetes.', - # Kubernetes may be using exec-based auth, which may not work by - # directly copying the kubeconfig file to the controller. - # Support for service accounts for auth will be added in #3377, which - # will allow us to support hosting controllers. 
- clouds.CloudImplementationFeatures.HOST_CONTROLLERS: 'Kubernetes can ' - 'not host ' - 'controllers.', } IMAGE_CPU = 'skypilot:cpu-ubuntu-2004' diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index b4a312ac1ab..27c0e77af62 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -243,7 +243,21 @@ def _get_cloud_dependencies_installation_commands( commands.append( f'echo -en "\\r{prefix_str}Kubernetes{empty_str}" && ' 'pip list | grep kubernetes > /dev/null 2>&1 || ' - 'pip install "kubernetes>=20.0.0" > /dev/null 2>&1') + 'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&' + # Install k8s + skypilot dependencies + 'sudo bash -c "if ' + '! command -v curl &> /dev/null || ' + '! command -v socat &> /dev/null || ' + '! command -v netcat &> /dev/null; ' + 'then apt update && apt install curl socat netcat -y; ' + 'fi" && ' + # Install kubectl + '(command -v kubectl &>/dev/null || ' + '(curl -LO "https://dl.k8s.io/release/' + '$(curl -L -s https://dl.k8s.io/release/stable.txt)' + '/bin/linux/amd64/kubectl" && ' + 'sudo install -o root -g root -m 0755 ' + 'kubectl /usr/local/bin/kubectl))') elif isinstance(cloud, clouds.RunPod): commands.append( f'echo -en "\\r{prefix_str}RunPod{empty_str}" && ' From 8a54fafa374887b8f1407267a18aa066fa103192 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 18:31:46 -0700 Subject: [PATCH 77/85] add skip_status_check --- sky/backends/backend_utils.py | 12 ++++++++++-- sky/serve/core.py | 4 +++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index f7a8a29e981..b0856ed1909 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2721,13 +2721,20 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str, def get_endpoints(cluster: str, - port: Optional[Union[int, str]] = None) -> Dict[int, str]: + port: Optional[Union[int, str]] = None, + skip_status_check: 
bool = False) -> Dict[int, str]: """Gets the endpoint for a given cluster and port number (endpoint). Args: cluster: The name of the cluster. port: The port number to get the endpoint for. If None, endpoints for all ports are returned. + skip_status_check: Whether to skip the status check for the cluster. + This is useful when the cluster is known to be in a INIT state + and the caller wants to query the endpoints. Used by serve + controller to query endpoints during cluster launch when multiple + services may be getting launched in parallel (and as a result, + the controller may be in INIT status due to a concurrent launch). Returns: A dictionary of port numbers to endpoints. If endpoint is None, the dictionary will contain all ports:endpoints exposed on the cluster. @@ -2751,7 +2758,8 @@ def get_endpoints(cluster: str, refresh=False, cluster_names=[cluster]) cluster_record = cluster_records[0] - if cluster_record['status'] != status_lib.ClusterStatus.UP: + if (not skip_status_check and + cluster_record['status'] != status_lib.ClusterStatus.UP): with ux_utils.print_exception_no_traceback(): raise exceptions.ClusterNotUpError( f'Cluster {cluster_record["name"]!r} ' diff --git a/sky/serve/core.py b/sky/serve/core.py index dba1eade5e7..9680b90de0c 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -255,7 +255,9 @@ def up( lb_port = serve_utils.load_service_initialization_result( lb_port_payload) endpoint = backend_utils.get_endpoints( - controller_handle.cluster_name, lb_port)[lb_port] + controller_handle.cluster_name, lb_port, + skip_status_check=True).get(lb_port) + assert endpoint is not None, 'Did not get endpoint for controller.' 
sky_logging.print( f'{fore.CYAN}Service name: ' From 5fb42322182ef955f10a472f339c903208b97a0b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 20:52:11 -0700 Subject: [PATCH 78/85] remove zone requirement --- tests/skyserve/auto_restart.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/skyserve/auto_restart.yaml b/tests/skyserve/auto_restart.yaml index 0b440753902..2a3a31051b9 100644 --- a/tests/skyserve/auto_restart.yaml +++ b/tests/skyserve/auto_restart.yaml @@ -8,7 +8,6 @@ service: resources: ports: 8080 cloud: gcp - zone: us-central1-a cpus: 2+ workdir: examples/serve/http_server From 2e8afe9967b94fd38b7c698d1816626183b5f4a0 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 22:01:35 -0700 Subject: [PATCH 79/85] fix timings for test --- tests/test_smoke.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9a819a7dc25..895eac9b83d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3407,6 +3407,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): @pytest.mark.serve +@pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP def test_skyserve_load_balancer(generic_cloud: str): """Test skyserve load balancer round-robin policy""" name = _get_service_name() @@ -3573,7 +3574,7 @@ def test_skyserve_fast_update(generic_cloud: str): f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', f'sky serve update {name} --cloud {generic_cloud} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', # sleep to wait for update to be registered. - 'sleep 120', + 'sleep 30', # 2 on-deamnd (ready) + 1 on-demand (provisioning). 
( _check_replica_in_status( @@ -3587,7 +3588,7 @@ def test_skyserve_fast_update(generic_cloud: str): # Test rolling update f'sky serve update {name} --cloud {generic_cloud} -y tests/skyserve/update/bump_version_before.yaml', # sleep to wait for update to be registered. - 'sleep 30', + 'sleep 15', # 2 on-deamnd (ready) + 1 on-demand (shutting down). _check_replica_in_status(name, [(2, False, 'READY'), (1, False, 'SHUTTING_DOWN')]), From 736e087820c4c62aadc98fdf82aa0d8d5dad1d07 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 23:02:34 -0700 Subject: [PATCH 80/85] silence curl download --- sky/utils/controller_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 4087d2d984c..9908fa54286 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -242,7 +242,7 @@ def _get_cloud_dependencies_installation_commands( 'fi" && ' # Install kubectl '(command -v kubectl &>/dev/null || ' - '(curl -LO "https://dl.k8s.io/release/' + '(curl -s -LO "https://dl.k8s.io/release/' '$(curl -L -s https://dl.k8s.io/release/stable.txt)' '/bin/linux/amd64/kubectl" && ' 'sudo install -o root -g root -m 0755 ' From e3768b41c4fd786533874c9fb661e49122290a60 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 23:12:01 -0700 Subject: [PATCH 81/85] move jq from yaml to test_minimal --- tests/test_smoke.py | 2 ++ tests/test_yamls/minimal.yaml | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 895eac9b83d..efea1353b4e 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -284,6 +284,8 @@ def test_minimal(generic_cloud: str): # Ensure the raylet process has the correct file descriptor limit. f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', f'sky logs {name} 2 --status', # Ensure the job succeeded. 
+ # Install jq for the next test. + f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', # Check the cluster info f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', f'sky logs {name} 3 --status', # Ensure the job succeeded. diff --git a/tests/test_yamls/minimal.yaml b/tests/test_yamls/minimal.yaml index 33210f829bf..d7f9d4482f2 100644 --- a/tests/test_yamls/minimal.yaml +++ b/tests/test_yamls/minimal.yaml @@ -2,7 +2,6 @@ name: min setup: | echo "running setup" - sudo apt-get install -y jq run: | conda env list From 26cc3803a41e54540ef03685a5c52654f35a918a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 23:23:46 -0700 Subject: [PATCH 82/85] move jq from yaml to test_minimal --- tests/test_smoke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index efea1353b4e..c0469abb109 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -288,9 +288,9 @@ def test_minimal(generic_cloud: str): f'sky exec {name} \'sudo apt-get update && sudo apt-get install -y jq\'', # Check the cluster info f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cluster_name | grep {name}\'', - f'sky logs {name} 3 --status', # Ensure the job succeeded. - f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', f'sky logs {name} 4 --status', # Ensure the job succeeded. + f'sky exec {name} \'echo "$SKYPILOT_CLUSTER_INFO" | jq .cloud | grep -i {generic_cloud}\'', + f'sky logs {name} 5 --status', # Ensure the job succeeded. 
], f'sky down -y {name}', _get_timeout(generic_cloud), From 3b7b935c92cf39f45a671bdfce21ae754e9d2b3f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 23:34:51 -0700 Subject: [PATCH 83/85] add assert --- sky/backends/cloud_vm_ray_backend.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 48489f73a30..1c7ad0e0b59 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4063,6 +4063,11 @@ def set_autostop(self, if idle_minutes_to_autostop is not None: # Skip auto-stop for Kubernetes clusters. if isinstance(handle.launched_resources.cloud, clouds.Kubernetes): + # We should hit this code path only for the jobs controller on + # Kubernetes clusters. + assert (controller_utils.Controllers.from_name( + handle.cluster_name) == + controller_utils.Controllers.JOBS_CONTROLLER) logger.info('Auto-stop is not supported for Kubernetes ' 'clusters. Skipping.') return From 76589641dce37ad7b433f0520c505af22509d10e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 23:40:02 -0700 Subject: [PATCH 84/85] lint --- sky/backends/cloud_vm_ray_backend.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 1c7ad0e0b59..6ff611d342c 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1994,8 +1994,12 @@ def provision_with_retries( requested_features = self._requested_features.copy() # Skip stop feature for Kubernetes jobs controller. 
- if isinstance(to_provision.cloud, clouds.Kubernetes) and controller_utils.Controllers.from_name( - cluster_name) == controller_utils.Controllers.JOBS_CONTROLLER: + if isinstance(to_provision.cloud, clouds.Kubernetes + ) and controller_utils.Controllers.from_name( + cluster_name + ) == controller_utils.Controllers.JOBS_CONTROLLER: + assert (clouds.CloudImplementationFeatures.STOP in + requested_features), requested_features requested_features.remove( clouds.CloudImplementationFeatures.STOP) @@ -4066,8 +4070,8 @@ def set_autostop(self, # We should hit this code path only for the jobs controller on # Kubernetes clusters. assert (controller_utils.Controllers.from_name( - handle.cluster_name) == - controller_utils.Controllers.JOBS_CONTROLLER) + handle.cluster_name) == controller_utils.Controllers. + JOBS_CONTROLLER), handle.cluster_name logger.info('Auto-stop is not supported for Kubernetes ' 'clusters. Skipping.') return From 4e3a3cd0f4d2e25dcea57d8b61039b343b12952f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 23:41:36 -0700 Subject: [PATCH 85/85] lint --- sky/backends/cloud_vm_ray_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 6ff611d342c..e17845f4989 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1998,8 +1998,8 @@ def provision_with_retries( ) and controller_utils.Controllers.from_name( cluster_name ) == controller_utils.Controllers.JOBS_CONTROLLER: - assert (clouds.CloudImplementationFeatures.STOP in - requested_features), requested_features + assert (clouds.CloudImplementationFeatures.STOP + in requested_features), requested_features requested_features.remove( clouds.CloudImplementationFeatures.STOP)