Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[K8s] Wait until endpoint to be ready for --endpoint call #3634

Merged
merged 9 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion sky/provision/kubernetes/network.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
"""Kubernetes network provisioning."""
from typing import Any, Dict, List, Optional

from sky import sky_logging
from sky.adaptors import kubernetes
from sky.provision import common
from sky.provision.kubernetes import network_utils
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils.resources_utils import port_ranges_to_set

logger = sky_logging.init_logger(__name__)

_PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}'
_LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb'

Expand Down Expand Up @@ -218,12 +221,17 @@ def _query_ports_for_loadbalancer(
ports: List[int],
provider_config: Dict[str, Any],
) -> Dict[int, List[common.Endpoint]]:
logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}')
result: Dict[int, List[common.Endpoint]] = {}
service_name = _LOADBALANCER_SERVICE_NAME.format(
cluster_name_on_cloud=cluster_name_on_cloud)
external_ip = network_utils.get_loadbalancer_ip(
namespace=provider_config.get('namespace', 'default'),
service_name=service_name)
service_name=service_name,
# Timeout is set so that we can retry the query when the
# cluster is firstly created and the load balancer is not ready yet.
timeout=60,
)

if external_ip is None:
return {}
Expand Down
31 changes: 23 additions & 8 deletions sky/provision/kubernetes/network_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
"""Kubernetes network provisioning utils."""
import os
import time
from typing import Dict, List, Optional, Tuple, Union

import jinja2
import yaml

import sky
from sky import exceptions
from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import kubernetes
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils import ux_utils

logger = sky_logging.init_logger(__name__)

_INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2'
_LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2'

Expand Down Expand Up @@ -239,18 +243,29 @@ def get_ingress_external_ip_and_ports(
return external_ip, None


def get_loadbalancer_ip(namespace: str, service_name: str) -> Optional[str]:
def get_loadbalancer_ip(namespace: str,
service_name: str,
timeout: int = 0) -> Optional[str]:
"""Returns the IP address of the load balancer."""
core_api = kubernetes.core_api()
service = core_api.read_namespaced_service(
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)

if service.status.load_balancer.ingress is None:
return None
ip = None

ip = service.status.load_balancer.ingress[
0].ip or service.status.load_balancer.ingress[0].hostname
return ip if ip is not None else None
start_time = time.time()
retry_cnt = 0
while ip is None and time.time() - start_time < timeout:
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
service = core_api.read_namespaced_service(
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
if service.status.load_balancer.ingress is not None:
ip = (service.status.load_balancer.ingress[0].ip or
service.status.load_balancer.ingress[0].hostname)
if ip is None:
retry_cnt += 1
if retry_cnt % 5 == 0:
logger.debug('Waiting for load balancer IP to be assigned'
'...')
time.sleep(1)
return ip
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following the end-to-end argument, I'm thinking this retry + timeout functionality is better implemented higher up in the stack (perhaps in backend_utils.get_endpoints when provision_lib.query_ports is called). E.g., it might also be required for other port query methods/clouds in the future.

wdyt?

Copy link
Collaborator Author

@Michaelvll Michaelvll Jul 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this issue happen to ingress mode as well? Adding to the higher level makes sense to me, when the issue is general. Otherwise, it may introduce unnecessary overheads for retrying before erroring out, when the ports actually fail to expose. Wdyt?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see - I'm okay with having this check here then :) Maybe we can make a note in provision_lib.query_ports doc str that the underlying implementation is responsible for retries and timeout.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! Updated.



def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]:
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/controller_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def _get_cloud_dependencies_installation_commands(
prefix_str = 'Check & install cloud dependencies on controller: '
# This is to make sure the shorter checking message does not have junk
# characters from the previous message.
empty_str = ' ' * 5
empty_str = ' ' * 10
aws_dependencies_installation = (
'pip list | grep boto3 > /dev/null 2>&1 || pip install '
'botocore>=1.29.10 boto3>=1.26.1; '
Expand Down
Loading