From 6fc56bee115ab979ed3c51b8b62d4eacb49b5f0d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Jun 2024 02:58:10 +0000 Subject: [PATCH 1/8] Wait until endpoint to be ready for k8s --- sky/provision/kubernetes/network.py | 4 +++- sky/provision/kubernetes/network_utils.py | 11 ++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index 875547e7677..dbd87e53df7 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -223,7 +223,9 @@ def _query_ports_for_loadbalancer( cluster_name_on_cloud=cluster_name_on_cloud) external_ip = network_utils.get_loadbalancer_ip( namespace=provider_config.get('namespace', 'default'), - service_name=service_name) + service_name=service_name, + timeout=10, + ) if external_ip is None: return {} diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 836d75af41f..7e543f7ea84 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -1,5 +1,6 @@ """Kubernetes network provisioning utils.""" import os +import time from typing import Dict, List, Optional, Tuple, Union import jinja2 @@ -222,7 +223,9 @@ def get_ingress_external_ip_and_ports( return external_ip, None -def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]: +def get_loadbalancer_ip(namespace: str, + service_name: str, + timeout: int = 0) -> Optional[str]: """Returns the IP address of the load balancer.""" core_api = kubernetes.core_api() service = core_api.read_namespaced_service( @@ -233,6 +236,12 @@ def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]: ip = service.status.load_balancer.ingress[ 0].ip or service.status.load_balancer.ingress[0].hostname + start_time = time.time() + while ip is None and time.time() - start_time < timeout: + service = core_api.read_namespaced_service( + service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) + ip = (service.status.load_balancer.ingress[0].ip or + service.status.load_balancer.ingress[0].hostname) return ip if ip is not None else None From 44ba799afe590ea79dcafbd43afbf7861b69105e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Jun 2024 05:24:43 +0000 Subject: [PATCH 2/8] fix --- sky/provision/kubernetes/network.py | 8 +++++++- sky/provision/kubernetes/network_utils.py | 18 ++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index dbd87e53df7..f3e02f31312 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -1,6 +1,7 @@ """Kubernetes network provisioning.""" from typing import Any, Dict, List, Optional +from sky import sky_logging from sky.adaptors import kubernetes from sky.provision import common from sky.provision.kubernetes import network_utils @@ -8,6 +9,8 @@ from sky.utils import kubernetes_enums from sky.utils.resources_utils import port_ranges_to_set +logger = sky_logging.init_logger(__name__) + _PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}' _LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb' @@ -218,13 +221,16 @@ def _query_ports_for_loadbalancer( ports: List[int], provider_config: Dict[str, Any], ) -> Dict[int, List[common.Endpoint]]: + logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}') result: Dict[int, List[common.Endpoint]] = {} service_name = _LOADBALANCER_SERVICE_NAME.format( cluster_name_on_cloud=cluster_name_on_cloud) external_ip = network_utils.get_loadbalancer_ip( namespace=provider_config.get('namespace', 'default'), service_name=service_name, - timeout=10, + # Timeout is set so that we can retry the query when the + # cluster is firstly created and the load balancer is not ready yet. + timeout=30, ) if external_ip is None: diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 7e543f7ea84..501ba26825f 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -8,12 +8,15 @@ import sky from sky import exceptions +from sky import sky_logging from sky import skypilot_config from sky.adaptors import kubernetes from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import kubernetes_enums from sky.utils import ux_utils +logger = sky_logging.init_logger(__name__) + _INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2' _LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2' @@ -228,20 +231,19 @@ def get_loadbalancer_ip(namespace: str, timeout: int = 0) -> Optional[str]: """Returns the IP address of the load balancer.""" core_api = kubernetes.core_api() - service = core_api.read_namespaced_service( - service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) - if service.status.load_balancer.ingress is None: - return None + ip = None - ip = service.status.load_balancer.ingress[ - 0].ip or service.status.load_balancer.ingress[0].hostname start_time = time.time() while ip is None and time.time() - start_time < timeout: service = core_api.read_namespaced_service( service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) - ip = (service.status.load_balancer.ingress[0].ip or - service.status.load_balancer.ingress[0].hostname) + if service.status.load_balancer.ingress is not None: + ip = (service.status.load_balancer.ingress[0].ip or + service.status.load_balancer.ingress[0].hostname) + if ip is None: + logger.debug('Waiting for load balancer IP to be assigned.') + time.sleep(1) return ip if ip is not None else None From 1eb36b5687e412af36b0371f3d83b5067b6f3cc1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Jun 2024 05:33:48 +0000 Subject: [PATCH 3/8] Less debug output --- sky/provision/kubernetes/network_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 501ba26825f..c99b873b264 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -235,6 +235,7 @@ def get_loadbalancer_ip(namespace: str, ip = None start_time = time.time() + retry_cnt = 0 while ip is None and time.time() - start_time < timeout: service = core_api.read_namespaced_service( service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) @@ -242,7 +243,10 @@ def get_loadbalancer_ip(namespace: str, ip = (service.status.load_balancer.ingress[0].ip or service.status.load_balancer.ingress[0].hostname) if ip is None: - logger.debug('Waiting for load balancer IP to be assigned.') + retry_cnt += 1 + if retry_cnt % 5 == 0: + logger.debug('Waiting for load balancer IP to be assigned' + '...') time.sleep(1) return ip if ip is not None else None From 30b05beafbb08c3de4bb277d302c5b852d766170 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Jun 2024 05:47:23 +0000 Subject: [PATCH 4/8] ux --- sky/utils/controller_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index c1859d52663..235d661bf18 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -191,7 +191,7 @@ def _get_cloud_dependencies_installation_commands( prefix_str = 'Check & install cloud dependencies on controller: ' # This is to make sure the shorter checking message does not have junk # characters from the previous message. - empty_str = ' ' * 5 + empty_str = ' ' * 10 aws_dependencies_installation = ( 'pip list | grep boto3 > /dev/null 2>&1 || pip install ' 'botocore>=1.29.10 boto3>=1.26.1; ' From c2b1a30e757f1265e67d1bd32d69ac577da4d3f7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Jun 2024 06:27:14 +0000 Subject: [PATCH 5/8] fix --- sky/provision/kubernetes/network.py | 2 +- sky/provision/kubernetes/network_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index f3e02f31312..e4b267e8ab3 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -230,7 +230,7 @@ def _query_ports_for_loadbalancer( service_name=service_name, # Timeout is set so that we can retry the query when the # cluster is firstly created and the load balancer is not ready yet. - timeout=30, + timeout=60, ) if external_ip is None: diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index c99b873b264..c3e2f617f73 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -248,7 +248,7 @@ def get_loadbalancer_ip(namespace: str, logger.debug('Waiting for load balancer IP to be assigned' '...') time.sleep(1) - return ip if ip is not None else None + return ip def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]: From b12a8c344298d4eb452bba65b8a1545fe444009b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 3 Jul 2024 18:43:26 +0000 Subject: [PATCH 6/8] address comments --- sky/provision/__init__.py | 4 ++++ sky/provision/kubernetes/network_utils.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py index 8371fb8ad83..0fe4ab614ce 100644 --- a/sky/provision/__init__.py +++ b/sky/provision/__init__.py @@ -155,6 +155,10 @@ def query_ports( return the endpoint without querying the cloud provider. If head_ip is not provided, the cloud provider will be queried to get the endpoint info. + The underlying implementation is responsible for retries and timeout, e.g. + kubernetes will wait for the service that expose the ports to be ready + before returning the endpoint info. + Returns a dict with port as the key and a list of common.Endpoint. """ del provider_name, provider_config, cluster_name_on_cloud # unused diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 41134b4318c..844f84a04f5 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -253,7 +253,7 @@ def get_loadbalancer_ip(namespace: str, start_time = time.time() retry_cnt = 0 - while ip is None and time.time() - start_time < timeout: + while ip is None and (retry_cnt == 0 or time.time() - start_time < timeout): service = core_api.read_namespaced_service( service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) if service.status.load_balancer.ingress is not None: From 422e9f4259f0ef11d680e0e49aed0757f2e9d57e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 4 Jul 2024 03:39:57 +0000 Subject: [PATCH 7/8] Add rich status for endpoint fetching --- sky/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index b1006fe19ab..f043dedbb5e 100644 --- a/sky/core.py +++ b/sky/core.py @@ -19,6 +19,7 @@ from sky.skylet import job_lib from sky.usage import usage_lib from sky.utils import controller_utils +from sky.utils import rich_utils from sky.utils import subprocess_utils if typing.TYPE_CHECKING: @@ -126,7 +127,8 @@ def endpoints(cluster: str, RuntimeError: if the cluster has no ports to be exposed or no endpoints are exposed yet. """ - return backend_utils.get_endpoints(cluster=cluster, port=port) + with rich_utils.safe_status('[bold cyan]Fetching endpoints...[/]'): + return backend_utils.get_endpoints(cluster=cluster, port=port) @usage_lib.entrypoint From b04c69a329133822ee2f620d1f590d5a2c990733 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 4 Jul 2024 06:56:26 +0000 Subject: [PATCH 8/8] Add rich status for waiting for the endpoint --- sky/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index f043dedbb5e..6b18fd2c190 100644 --- a/sky/core.py +++ b/sky/core.py @@ -127,7 +127,8 @@ def endpoints(cluster: str, RuntimeError: if the cluster has no ports to be exposed or no endpoints are exposed yet. """ - with rich_utils.safe_status('[bold cyan]Fetching endpoints...[/]'): + with rich_utils.safe_status('[bold cyan]Fetching endpoints for cluster ' + f'{cluster}...[/]'): return backend_utils.get_endpoints(cluster=cluster, port=port)