Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[K8s] Wait until endpoint to be ready for --endpoint call #3634

Merged
merged 9 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sky/provision/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ def query_ports(
return the endpoint without querying the cloud provider. If head_ip is not
provided, the cloud provider will be queried to get the endpoint info.

The underlying implementation is responsible for retries and timeout, e.g.
kubernetes will wait for the service that expose the ports to be ready
before returning the endpoint info.

Returns a dict with port as the key and a list of common.Endpoint.
"""
del provider_name, provider_config, cluster_name_on_cloud # unused
Expand Down
10 changes: 9 additions & 1 deletion sky/provision/kubernetes/network.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
"""Kubernetes network provisioning."""
from typing import Any, Dict, List, Optional

from sky import sky_logging
from sky.adaptors import kubernetes
from sky.provision import common
from sky.provision.kubernetes import network_utils
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils.resources_utils import port_ranges_to_set

logger = sky_logging.init_logger(__name__)

_PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}'
_LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb'

Expand Down Expand Up @@ -218,12 +221,17 @@ def _query_ports_for_loadbalancer(
ports: List[int],
provider_config: Dict[str, Any],
) -> Dict[int, List[common.Endpoint]]:
logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}')
result: Dict[int, List[common.Endpoint]] = {}
service_name = _LOADBALANCER_SERVICE_NAME.format(
cluster_name_on_cloud=cluster_name_on_cloud)
external_ip = network_utils.get_loadbalancer_ip(
namespace=provider_config.get('namespace', 'default'),
service_name=service_name)
service_name=service_name,
# Timeout is set so that we can retry the query when the
# cluster is firstly created and the load balancer is not ready yet.
timeout=60,
)

if external_ip is None:
return {}
Expand Down
31 changes: 23 additions & 8 deletions sky/provision/kubernetes/network_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
"""Kubernetes network provisioning utils."""
import os
import time
from typing import Dict, List, Optional, Tuple, Union

import jinja2
import yaml

import sky
from sky import exceptions
from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import kubernetes
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils import ux_utils

logger = sky_logging.init_logger(__name__)

_INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2'
_LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2'

Expand Down Expand Up @@ -239,18 +243,29 @@ def get_ingress_external_ip_and_ports(
return external_ip, None


def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]:
def get_loadbalancer_ip(namespace: str,
service_name: str,
timeout: int = 0) -> Optional[str]:
"""Returns the IP address of the load balancer."""
core_api = kubernetes.core_api()
service = core_api.read_namespaced_service(
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)

if service.status.load_balancer.ingress is None:
return None
ip = None

ip = service.status.load_balancer.ingress[
0].ip or service.status.load_balancer.ingress[0].hostname
return ip if ip is not None else None
start_time = time.time()
retry_cnt = 0
while ip is None and (retry_cnt == 0 or time.time() - start_time < timeout):
service = core_api.read_namespaced_service(
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
if service.status.load_balancer.ingress is not None:
ip = (service.status.load_balancer.ingress[0].ip or
service.status.load_balancer.ingress[0].hostname)
if ip is None:
retry_cnt += 1
if retry_cnt % 5 == 0:
logger.debug('Waiting for load balancer IP to be assigned'
'...')
time.sleep(1)
return ip


def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]:
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/controller_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def _get_cloud_dependencies_installation_commands(
prefix_str = 'Check & install cloud dependencies on controller: '
# This is to make sure the shorter checking message does not have junk
# characters from the previous message.
empty_str = ' ' * 5
empty_str = ' ' * 10
aws_dependencies_installation = (
'pip list | grep boto3 > /dev/null 2>&1 || pip install '
'botocore>=1.29.10 boto3>=1.26.1; '
Expand Down
Loading