Skip to content

Commit

Permalink
[K8s] Wait until endpoint to be ready for --endpoint call (#3634)
Browse files Browse the repository at this point in the history
* Wait until endpoint to be ready for k8s

* fix

* Less debug output

* ux

* fix

* address comments

* Add rich status  for endpoint fetching

* Add rich status for waiting for the endpoint
  • Loading branch information
Michaelvll authored Jul 4, 2024
1 parent f7cd5ad commit 92f55a4
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 11 deletions.
5 changes: 4 additions & 1 deletion sky/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from sky.skylet import job_lib
from sky.usage import usage_lib
from sky.utils import controller_utils
from sky.utils import rich_utils
from sky.utils import subprocess_utils

if typing.TYPE_CHECKING:
Expand Down Expand Up @@ -126,7 +127,9 @@ def endpoints(cluster: str,
RuntimeError: if the cluster has no ports to be exposed or no endpoints
are exposed yet.
"""
return backend_utils.get_endpoints(cluster=cluster, port=port)
with rich_utils.safe_status('[bold cyan]Fetching endpoints for cluster '
f'{cluster}...[/]'):
return backend_utils.get_endpoints(cluster=cluster, port=port)


@usage_lib.entrypoint
Expand Down
4 changes: 4 additions & 0 deletions sky/provision/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ def query_ports(
return the endpoint without querying the cloud provider. If head_ip is not
provided, the cloud provider will be queried to get the endpoint info.
The underlying implementation is responsible for retries and timeout, e.g.
kubernetes will wait for the service that expose the ports to be ready
before returning the endpoint info.
Returns a dict with port as the key and a list of common.Endpoint.
"""
del provider_name, provider_config, cluster_name_on_cloud # unused
Expand Down
10 changes: 9 additions & 1 deletion sky/provision/kubernetes/network.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
"""Kubernetes network provisioning."""
from typing import Any, Dict, List, Optional

from sky import sky_logging
from sky.adaptors import kubernetes
from sky.provision import common
from sky.provision.kubernetes import network_utils
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils.resources_utils import port_ranges_to_set

logger = sky_logging.init_logger(__name__)

_PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}'
_LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb'

Expand Down Expand Up @@ -218,12 +221,17 @@ def _query_ports_for_loadbalancer(
ports: List[int],
provider_config: Dict[str, Any],
) -> Dict[int, List[common.Endpoint]]:
logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}')
result: Dict[int, List[common.Endpoint]] = {}
service_name = _LOADBALANCER_SERVICE_NAME.format(
cluster_name_on_cloud=cluster_name_on_cloud)
external_ip = network_utils.get_loadbalancer_ip(
namespace=provider_config.get('namespace', 'default'),
service_name=service_name)
service_name=service_name,
# Timeout is set so that we can retry the query when the
# cluster is firstly created and the load balancer is not ready yet.
timeout=60,
)

if external_ip is None:
return {}
Expand Down
31 changes: 23 additions & 8 deletions sky/provision/kubernetes/network_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
"""Kubernetes network provisioning utils."""
import os
import time
from typing import Dict, List, Optional, Tuple, Union

import jinja2
import yaml

import sky
from sky import exceptions
from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import kubernetes
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils import ux_utils

logger = sky_logging.init_logger(__name__)

_INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2'
_LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2'

Expand Down Expand Up @@ -239,18 +243,29 @@ def get_ingress_external_ip_and_ports(
return external_ip, None


def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]:
def get_loadbalancer_ip(namespace: str,
service_name: str,
timeout: int = 0) -> Optional[str]:
"""Returns the IP address of the load balancer."""
core_api = kubernetes.core_api()
service = core_api.read_namespaced_service(
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)

if service.status.load_balancer.ingress is None:
return None
ip = None

ip = service.status.load_balancer.ingress[
0].ip or service.status.load_balancer.ingress[0].hostname
return ip if ip is not None else None
start_time = time.time()
retry_cnt = 0
while ip is None and (retry_cnt == 0 or time.time() - start_time < timeout):
service = core_api.read_namespaced_service(
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
if service.status.load_balancer.ingress is not None:
ip = (service.status.load_balancer.ingress[0].ip or
service.status.load_balancer.ingress[0].hostname)
if ip is None:
retry_cnt += 1
if retry_cnt % 5 == 0:
logger.debug('Waiting for load balancer IP to be assigned'
'...')
time.sleep(1)
return ip


def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]:
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/controller_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def _get_cloud_dependencies_installation_commands(
prefix_str = 'Check & install cloud dependencies on controller: '
# This is to make sure the shorter checking message does not have junk
# characters from the previous message.
empty_str = ' ' * 5
empty_str = ' ' * 10
aws_dependencies_installation = (
'pip list | grep boto3 > /dev/null 2>&1 || pip install '
'botocore>=1.29.10 boto3>=1.26.1; '
Expand Down

0 comments on commit 92f55a4

Please sign in to comment.