Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/skypilot-org/skypilot int…
Browse files Browse the repository at this point in the history
…o docs-k8s-ts-disk

# Conflicts:
#	docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
  • Loading branch information
romilbhardwaj committed Jul 4, 2024
2 parents a9da0af + 4c6abac commit f8c8fad
Show file tree
Hide file tree
Showing 14 changed files with 189 additions and 44 deletions.
31 changes: 16 additions & 15 deletions docs/source/getting-started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -311,25 +311,26 @@ Fluidstack
Cudo Compute
~~~~~~~~~~~~~~~~~~

`Cudo Compute <https://www.cudocompute.com/>`__ GPU cloud provides low cost GPUs powered with green energy.
1. Create a billing account by following `this guide <https://www.cudocompute.com/docs/guide/billing/>`__.
2. Create a project `<https://www.cudocompute.com/docs/guide/projects/>`__.
3. Create an API Key by following `this guide <https://www.cudocompute.com/docs/guide/api-keys/>`__.
3. Download and install the `cudoctl <https://www.cudocompute.com/docs/cli-tool/>`__ command line tool
3. Run :code:`cudoctl init`:
`Cudo Compute <https://www.cudocompute.com/>`__ provides low cost GPUs powered by green energy.

.. code-block:: shell
1. Create a `billing account <https://www.cudocompute.com/docs/guide/billing/>`__.
2. Create a `project <https://www.cudocompute.com/docs/guide/projects/>`__.
3. Create an `API Key <https://www.cudocompute.com/docs/guide/api-keys/>`__.
4. Download and install the `cudoctl <https://www.cudocompute.com/docs/cli-tool/>`__ command line tool
5. Run :code:`cudoctl init`:

.. code-block:: shell
cudoctl init
✔ api key: my-api-key
✔ project: my-project
✔ billing account: my-billing-account
✔ context: default
config file saved ~/.config/cudo/cudo.yml
cudoctl init
✔ api key: my-api-key
✔ project: my-project
✔ billing account: my-billing-account
✔ context: default
config file saved ~/.config/cudo/cudo.yml
pip install "cudo-compute>=0.1.10"
pip install "cudo-compute>=0.1.10"
If you want to want to use skypilot with a different Cudo Compute account or project, just run :code:`cudoctl init`: again.
If you want to use SkyPilot with a different Cudo Compute account or project, run :code:`cudoctl init` again.



Expand Down
11 changes: 10 additions & 1 deletion examples/managed_job_with_storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ workdir: ./examples

file_mounts:
~/bucket_workdir:
# Change this to the your own globally unique bucket name.
# Change this to your own globally unique bucket name.
name: sky-workdir-zhwu
source: ./examples
persistent: false
mode: COPY

/output_path:
# Change this to your own globally unique bucket name.
name: sky-output-bucket
mode: MOUNT

/imagenet-image:
source: s3://sky-imagenet-data

Expand Down Expand Up @@ -55,3 +61,6 @@ run: |
cat ~/tmpfile
cat ~/a/b/c/tmpfile
# Write to a file in the mounted bucket
echo "hello world!" > /output_path/output.txt
4 changes: 4 additions & 0 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ class Cudo(clouds.Cloud):
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
('Docker image is currently not supported on Cudo. You can try '
'running docker command inside the `run` section in task.yaml.'),
clouds.CloudImplementationFeatures.HOST_CONTROLLERS: (
'Cudo Compute cannot host a controller as it does not '
'autostopping, which will leave the controller to run indefinitely.'
),
}
_MAX_CLUSTER_NAME_LEN_LIMIT = 60

Expand Down
5 changes: 4 additions & 1 deletion sky/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from sky.skylet import job_lib
from sky.usage import usage_lib
from sky.utils import controller_utils
from sky.utils import rich_utils
from sky.utils import subprocess_utils

if typing.TYPE_CHECKING:
Expand Down Expand Up @@ -126,7 +127,9 @@ def endpoints(cluster: str,
RuntimeError: if the cluster has no ports to be exposed or no endpoints
are exposed yet.
"""
return backend_utils.get_endpoints(cluster=cluster, port=port)
with rich_utils.safe_status('[bold cyan]Fetching endpoints for cluster '
f'{cluster}...[/]'):
return backend_utils.get_endpoints(cluster=cluster, port=port)


@usage_lib.entrypoint
Expand Down
4 changes: 4 additions & 0 deletions sky/provision/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ def query_ports(
return the endpoint without querying the cloud provider. If head_ip is not
provided, the cloud provider will be queried to get the endpoint info.
The underlying implementation is responsible for retries and timeout, e.g.
kubernetes will wait for the service that exposes the ports to be ready
before returning the endpoint info.
Returns a dict with port as the key and a list of common.Endpoint.
"""
del provider_name, provider_config, cluster_name_on_cloud # unused
Expand Down
10 changes: 9 additions & 1 deletion sky/provision/aws/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,15 @@ def open_ports(
range(existing_rule['FromPort'], existing_rule['ToPort'] + 1))
elif existing_rule['IpProtocol'] == '-1':
# For AWS, IpProtocol = -1 means all traffic
existing_ports.add(-1)
for group_pairs in existing_rule['UserIdGroupPairs']:
if group_pairs['GroupId'] != sg.id:
# We skip the port opening when the rule allows access from
# other security groups, as that is likely added by a user
# manually and satisfies their requirement.
# The security group created by SkyPilot allows all traffic
# from the same security group, which should not be skipped.
existing_ports.add(-1)
break
break

ports_to_open = []
Expand Down
3 changes: 2 additions & 1 deletion sky/provision/cudo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from sky.provision.cudo.config import bootstrap_instances
from sky.provision.cudo.instance import cleanup_ports
from sky.provision.cudo.instance import get_cluster_info
from sky.provision.cudo.instance import open_ports
from sky.provision.cudo.instance import query_instances
from sky.provision.cudo.instance import run_instances
from sky.provision.cudo.instance import stop_instances
Expand All @@ -11,4 +12,4 @@

__all__ = ('bootstrap_instances', 'run_instances', 'stop_instances',
'terminate_instances', 'wait_instances', 'get_cluster_info',
'cleanup_ports', 'query_instances')
'cleanup_ports', 'query_instances', 'open_ports')
15 changes: 12 additions & 3 deletions sky/provision/cudo/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,10 @@ def terminate_instances(
del provider_config
instances = _filter_instances(cluster_name_on_cloud, None)
for inst_id, inst in instances.items():
logger.info(f'Terminating instance {inst_id}.'
f'{inst}')
if worker_only and inst['name'].endswith('-head'):
continue
logger.info(f'Removing {inst_id}: {inst}')
logger.debug(f'Terminating Cudo instance {inst_id}.'
f'{inst}')
cudo_wrapper.remove(inst_id)


Expand Down Expand Up @@ -220,6 +219,16 @@ def query_instances(
return statuses


def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Open ports for a Cudo cluster; intentionally a no-op.

    None of the arguments are used: they are discarded immediately and the
    function always returns None.
    """
    # Cudo has all ports open by default. Nothing to do here.
    del cluster_name_on_cloud
    del ports
    del provider_config


def cleanup_ports(
cluster_name_on_cloud: str,
ports: List[str],
Expand Down
10 changes: 9 additions & 1 deletion sky/provision/kubernetes/network.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
"""Kubernetes network provisioning."""
from typing import Any, Dict, List, Optional

from sky import sky_logging
from sky.adaptors import kubernetes
from sky.provision import common
from sky.provision.kubernetes import network_utils
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils.resources_utils import port_ranges_to_set

logger = sky_logging.init_logger(__name__)

_PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}'
_LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb'

Expand Down Expand Up @@ -218,12 +221,17 @@ def _query_ports_for_loadbalancer(
ports: List[int],
provider_config: Dict[str, Any],
) -> Dict[int, List[common.Endpoint]]:
logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}')
result: Dict[int, List[common.Endpoint]] = {}
service_name = _LOADBALANCER_SERVICE_NAME.format(
cluster_name_on_cloud=cluster_name_on_cloud)
external_ip = network_utils.get_loadbalancer_ip(
namespace=provider_config.get('namespace', 'default'),
service_name=service_name)
service_name=service_name,
# Timeout is set so that we can retry the query when the
# cluster is first created and the load balancer is not ready yet.
timeout=60,
)

if external_ip is None:
return {}
Expand Down
31 changes: 23 additions & 8 deletions sky/provision/kubernetes/network_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
"""Kubernetes network provisioning utils."""
import os
import time
from typing import Dict, List, Optional, Tuple, Union

import jinja2
import yaml

import sky
from sky import exceptions
from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import kubernetes
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
from sky.utils import ux_utils

logger = sky_logging.init_logger(__name__)

_INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2'
_LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2'

Expand Down Expand Up @@ -239,18 +243,29 @@ def get_ingress_external_ip_and_ports(
return external_ip, None


def get_loadbalancer_ip(namespace: str,
                        service_name: str,
                        timeout: int = 0) -> Optional[str]:
    """Returns the IP address (or hostname) of the load balancer.

    The service is always queried at least once. While no address has been
    assigned, the query is repeated with a one-second pause between attempts
    until `timeout` seconds have elapsed.

    Args:
        namespace: Namespace the load balancer service lives in.
        service_name: Name of the load balancer service.
        timeout: Maximum number of seconds to keep polling. With the default
            of 0 the service is queried exactly once, without waiting.

    Returns:
        The `ip` of the first ingress entry, falling back to its `hostname`;
        None if no ingress entry was reported within the timeout.
    """
    core_api = kubernetes.core_api()

    ip = None
    start_time = time.time()
    retry_cnt = 0
    while True:
        service = core_api.read_namespaced_service(
            service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
        ingress = service.status.load_balancer.ingress
        # Truthiness check: treat both a missing and an empty ingress list
        # as "not assigned yet" instead of indexing into an empty list.
        if ingress:
            ip = ingress[0].ip or ingress[0].hostname
        # Check the deadline *before* sleeping so that a single-shot call
        # (timeout=0) or the final failed attempt does not pay an extra
        # one-second delay for nothing.
        if ip is not None or time.time() - start_time >= timeout:
            break
        retry_cnt += 1
        if retry_cnt % 5 == 0:
            logger.debug('Waiting for load balancer IP to be assigned'
                         '...')
        time.sleep(1)
    return ip


def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]:
Expand Down
17 changes: 15 additions & 2 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,13 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
return value.replace('nvidia-tesla-', '').upper()
elif value.startswith('nvidia-'):
acc = value.replace('nvidia-', '').upper()
if acc == 'H100-80GB':
# H100 is named as H100-80GB in GKE.
if acc in ['H100-80GB', 'H100-MEGA-80GB']:
# H100 is named H100-80GB or H100-MEGA-80GB in GKE,
# where the latter has improved bandwidth.
# See a3-mega instances on GCP.
# TODO: we do not distinguish the two GPUs for simplicity,
# but we can evaluate whether we should distinguish
# them based on users' requests.
return 'H100'
return acc
else:
Expand Down Expand Up @@ -1524,6 +1529,14 @@ def create_namespace(namespace: str) -> None:
namespace: Name of the namespace to create
"""
kubernetes_client = kubernetes.kubernetes.client
try:
kubernetes.core_api().read_namespace(namespace)
except kubernetes.api_exception() as e:
if e.status != 404:
raise
else:
return

ns_metadata = dict(name=namespace, labels={'parent': 'skypilot'})
merge_custom_metadata(ns_metadata)
namespace_obj = kubernetes_client.V1Namespace(metadata=ns_metadata)
Expand Down
28 changes: 27 additions & 1 deletion sky/skylet/providers/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

UNIQUE_ID_LEN = 4
_WAIT_NSG_CREATION_NUM_TIMEOUT_SECONDS = 600
_WAIT_FOR_RESOURCE_GROUP_DELETION_TIMEOUT_SECONDS = 480 # 8 minutes


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -80,7 +82,31 @@ def _configure_resource_group(config):
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups, function_name="create_or_update"
)
rg_create_or_update(resource_group_name=resource_group, parameters=params)
# Creating the resource group can fail with ResourceExistsError while a group
# of the same name is still being deleted; keep retrying once per second
# until the creation succeeds or the deletion-wait timeout elapses.
# NOTE(review): if the timeout elapses, the loop exits without the group
# having been created and without raising — confirm this is intended.
rg_creation_start = time.time()
retry = 0
while (
    time.time() - rg_creation_start
    < _WAIT_FOR_RESOURCE_GROUP_DELETION_TIMEOUT_SECONDS
):
    try:
        rg_create_or_update(resource_group_name=resource_group, parameters=params)
        break
    except azure.exceptions().ResourceExistsError as e:
        if "ResourceGroupBeingDeleted" in str(e):
            # Only warn on every 5th retry to avoid flooding the log.
            if retry % 5 == 0:
                # TODO(zhwu): This should be shown in terminal for better
                # UX, which will be achieved after we move Azure to use
                # SkyPilot provisioner.
                # Every fragment containing a placeholder needs its own `f`
                # prefix; assumes config has a 'cluster_name' key, as the
                # original message intended — TODO confirm.
                logger.warning(
                    f"Azure resource group {resource_group} of a recent "
                    f"terminated cluster {config['cluster_name']} is being "
                    "deleted. It can only be provisioned after it is fully "
                    "deleted. Waiting..."
                )
            time.sleep(1)
            retry += 1
            continue
        # Any other ResourceExistsError is unexpected: propagate it.
        raise

# load the template file
current_path = Path(__file__).parent
Expand Down
Loading

0 comments on commit f8c8fad

Please sign in to comment.