From 44ba799afe590ea79dcafbd43afbf7861b69105e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Jun 2024 05:24:43 +0000 Subject: [PATCH] fix --- sky/provision/kubernetes/network.py | 8 +++++++- sky/provision/kubernetes/network_utils.py | 18 ++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index dbd87e53df7..f3e02f31312 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -1,6 +1,7 @@ """Kubernetes network provisioning.""" from typing import Any, Dict, List, Optional +from sky import sky_logging from sky.adaptors import kubernetes from sky.provision import common from sky.provision.kubernetes import network_utils @@ -8,6 +9,8 @@ from sky.utils import kubernetes_enums from sky.utils.resources_utils import port_ranges_to_set +logger = sky_logging.init_logger(__name__) + _PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}' _LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb' @@ -218,13 +221,16 @@ def _query_ports_for_loadbalancer( ports: List[int], provider_config: Dict[str, Any], ) -> Dict[int, List[common.Endpoint]]: + logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}') result: Dict[int, List[common.Endpoint]] = {} service_name = _LOADBALANCER_SERVICE_NAME.format( cluster_name_on_cloud=cluster_name_on_cloud) external_ip = network_utils.get_loadbalancer_ip( namespace=provider_config.get('namespace', 'default'), service_name=service_name, - timeout=10, + # Timeout is set so that we can retry the query when the + # cluster is first created and the load balancer is not ready yet.
+ timeout=30, ) if external_ip is None: diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index 7e543f7ea84..501ba26825f 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -8,12 +8,15 @@ import sky from sky import exceptions +from sky import sky_logging from sky import skypilot_config from sky.adaptors import kubernetes from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import kubernetes_enums from sky.utils import ux_utils +logger = sky_logging.init_logger(__name__) + _INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2' _LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2' @@ -228,20 +231,19 @@ def get_loadbalancer_ip(namespace: str, timeout: int = 0) -> Optional[str]: """Returns the IP address of the load balancer.""" core_api = kubernetes.core_api() - service = core_api.read_namespaced_service( - service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) - if service.status.load_balancer.ingress is None: - return None + ip = None - ip = service.status.load_balancer.ingress[ - 0].ip or service.status.load_balancer.ingress[0].hostname start_time = time.time() while ip is None and time.time() - start_time < timeout: service = core_api.read_namespaced_service( service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT) - ip = (service.status.load_balancer.ingress[0].ip or - service.status.load_balancer.ingress[0].hostname) + if service.status.load_balancer.ingress is not None: + ip = (service.status.load_balancer.ingress[0].ip or + service.status.load_balancer.ingress[0].hostname) + if ip is None: + logger.debug('Waiting for load balancer IP to be assigned.') + time.sleep(1) return ip if ip is not None else None