diff --git a/sky/core.py b/sky/core.py index b1006fe19ab..6b18fd2c190 100644 --- a/sky/core.py +++ b/sky/core.py @@ -19,6 +19,7 @@ from sky.skylet import job_lib from sky.usage import usage_lib from sky.utils import controller_utils +from sky.utils import rich_utils from sky.utils import subprocess_utils if typing.TYPE_CHECKING: @@ -126,7 +127,9 @@ def endpoints(cluster: str, RuntimeError: if the cluster has no ports to be exposed or no endpoints are exposed yet. """ - return backend_utils.get_endpoints(cluster=cluster, port=port) + with rich_utils.safe_status('[bold cyan]Fetching endpoints for cluster ' + f'{cluster}...[/]'): + return backend_utils.get_endpoints(cluster=cluster, port=port) @usage_lib.entrypoint diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py index 8371fb8ad83..0fe4ab614ce 100644 --- a/sky/provision/__init__.py +++ b/sky/provision/__init__.py @@ -155,6 +155,10 @@ def query_ports( return the endpoint without querying the cloud provider. If head_ip is not provided, the cloud provider will be queried to get the endpoint info. + The underlying implementation is responsible for retries and timeout, e.g. + kubernetes will wait for the service that expose the ports to be ready + before returning the endpoint info. + Returns a dict with port as the key and a list of common.Endpoint. """ del provider_name, provider_config, cluster_name_on_cloud # unused diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index 875547e7677..e4b267e8ab3 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -1,6 +1,7 @@ """Kubernetes network provisioning.""" from typing import Any, Dict, List, Optional +from sky import sky_logging from sky.adaptors import kubernetes from sky.provision import common from sky.provision.kubernetes import network_utils @@ -8,6 +9,8 @@ from sky.utils import kubernetes_enums from sky.utils.resources_utils import port_ranges_to_set +logger = sky_logging.init_logger(__name__) + _PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}' _LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb' @@ -218,12 +221,17 @@ def _query_ports_for_loadbalancer( ports: List[int], provider_config: Dict[str, Any], ) -> Dict[int, List[common.Endpoint]]: + logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}') result: Dict[int, List[common.Endpoint]] = {} service_name = _LOADBALANCER_SERVICE_NAME.format( cluster_name_on_cloud=cluster_name_on_cloud) external_ip = network_utils.get_loadbalancer_ip( namespace=provider_config.get('namespace', 'default'), - service_name=service_name) + service_name=service_name, + # Timeout is set so that we can retry the query when the + # cluster is firstly created and the load balancer is not ready yet. + timeout=60, + ) if external_ip is None: return {} diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index c42ffee2f1c..844f84a04f5 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -1,5 +1,6 @@ """Kubernetes network provisioning utils.""" import os +import time from typing import Dict, List, Optional, Tuple, Union import jinja2 @@ -7,12 +8,15 @@ import sky from sky import exceptions +from sky import sky_logging from sky import skypilot_config from sky.adaptors import kubernetes from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import kubernetes_enums from sky.utils import ux_utils +logger = sky_logging.init_logger(__name__) + _INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2' _LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2' @@ -239,18 +243,29 @@ def get_ingress_external_ip_and_ports( return external_ip, None -def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]: +def get_loadbalancer_ip(namespace: str, + service_name: str, + timeout: int = 0) -> Optional[str]: """Returns the IP address of the load balancer.""" core_api = kubernetes.core_api() - service = core_api.read_namespaced_service( - service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) - if service.status.load_balancer.ingress is None: - return None + ip = None - ip = service.status.load_balancer.ingress[ - 0].ip or service.status.load_balancer.ingress[0].hostname - return ip if ip is not None else None + start_time = time.time() + retry_cnt = 0 + while ip is None and (retry_cnt == 0 or time.time() - start_time < timeout): + service = core_api.read_namespaced_service( + service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) + if service.status.load_balancer.ingress is not None: + ip = (service.status.load_balancer.ingress[0].ip or + service.status.load_balancer.ingress[0].hostname) + if ip is None: + retry_cnt += 1 + if retry_cnt % 5 == 0: + logger.debug('Waiting for load balancer IP to be assigned' + '...') + time.sleep(1) + return ip def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]: diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index c1859d52663..235d661bf18 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -191,7 +191,7 @@ def _get_cloud_dependencies_installation_commands( prefix_str = 'Check & install cloud dependencies on controller: ' # This is to make sure the shorter checking message does not have junk # characters from the previous message. - empty_str = ' ' * 5 + empty_str = ' ' * 10 aws_dependencies_installation = ( 'pip list | grep boto3 > /dev/null 2>&1 || pip install ' 'botocore>=1.29.10 boto3>=1.26.1; '