Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into fix-az-a10
Browse files Browse the repository at this point in the history
  • Loading branch information
cblmemo committed Jul 2, 2024
2 parents 823ed8d + b03c617 commit 64126d4
Show file tree
Hide file tree
Showing 31 changed files with 728 additions and 384 deletions.
11 changes: 10 additions & 1 deletion examples/managed_job_with_storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ workdir: ./examples

file_mounts:
~/bucket_workdir:
# Change this to the your own globally unique bucket name.
# Change this to your own globally unique bucket name.
name: sky-workdir-zhwu
source: ./examples
persistent: false
mode: COPY

/output_path:
# Change this to your own globally unique bucket name.
name: sky-output-bucket
mode: MOUNT

/imagenet-image:
source: s3://sky-imagenet-data

Expand Down Expand Up @@ -55,3 +61,6 @@ run: |
cat ~/tmpfile
cat ~/a/b/c/tmpfile
# Write to a file in the mounted bucket
echo "hello world!" > /output_path/output.txt
2 changes: 2 additions & 0 deletions llm/vllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ sky launch -c vllm-llama2 serve-openai-api.yaml --env HF_TOKEN=YOUR_HUGGING_FACE
```bash
sky launch -c vllm-llama2 serve-openai-api.yaml --gpus V100:1 --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN
```
**Tip**: You can also use the vLLM Docker container for a faster setup. Refer to [serve-openai-api-docker.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/serve-openai-api-docker.yaml) for details.

2. Check the IP for the cluster with:
```
IP=$(sky status --ip vllm-llama2)
Expand Down
20 changes: 20 additions & 0 deletions llm/vllm/serve-openai-api-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
envs:
  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
  image_id: docker:vllm/vllm-openai:latest
  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
  ports:
    - 8000

setup: |
  conda deactivate
  python3 -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"

run: |
  conda deactivate
  echo 'Starting vllm openai api server...'
  python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \
    --host 0.0.0.0
23 changes: 21 additions & 2 deletions sky/adaptors/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
# pylint: disable=import-outside-toplevel
import functools
import threading
import time

from sky.adaptors import common
from sky.utils import common_utils

azure = common.LazyImport(
'azure',
Expand All @@ -13,13 +15,30 @@
_LAZY_MODULES = (azure,)

_session_creation_lock = threading.RLock()
_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5


@common.load_lazy_modules(modules=_LAZY_MODULES)
@functools.lru_cache()
def get_subscription_id() -> str:
    """Get the default subscription id from the Azure CLI profile.

    The stripped diff left an unconditional early ``return`` before the
    retry loop, making the retry logic unreachable; only the retrying
    version is kept here.

    Returns:
        The subscription id of the default Azure CLI profile.

    Raises:
        Exception: Any error from the Azure CLI other than the transient
            'az login' race handled below.
    """
    from azure.common import credentials
    retry = 0
    backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4)
    while True:
        try:
            return credentials.get_cli_profile().get_subscription_id()
        except Exception as e:  # pylint: disable=broad-except
            if ('Please run \'az login\' to setup account.' in str(e) and
                    retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID):
                # When there are multiple processes trying to get the
                # subscription id, it may fail with the above error message.
                # Retry will fix the issue.
                retry += 1

                time.sleep(backoff.current_backoff())
                continue
            raise


@common.load_lazy_modules(modules=_LAZY_MODULES)
Expand All @@ -36,8 +55,8 @@ def exceptions():
return azure_exceptions


@functools.lru_cache()
@common.load_lazy_modules(modules=_LAZY_MODULES)
@functools.lru_cache()
def get_client(name: str, subscription_id: str):
# Sky only supports Azure CLI credential for now.
# Increase the timeout to fix the Azure get-access-token timeout issue.
Expand Down
37 changes: 36 additions & 1 deletion sky/adaptors/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

# pylint: disable=import-outside-toplevel

import logging
import os

from sky.adaptors import common
from sky.sky_logging import set_logging_level
from sky.utils import env_options
from sky.utils import ux_utils

Expand All @@ -28,6 +30,33 @@
API_TIMEOUT = 5


def _decorate_methods(obj, decorator):
for attr_name in dir(obj):
attr = getattr(obj, attr_name)
if callable(attr) and not attr_name.startswith('__'):
setattr(obj, attr_name, decorator(attr))
return obj


def _api_logging_decorator(logger: str, level: int):
    """Build a decorator that adjusts logging for API-client constructors.

    Every public method of the object returned by the decorated constructor
    is wrapped so that ``logger`` is set to ``level`` around each call. This
    suppresses the verbose urllib3 output emitted when Kubernetes API calls
    time out.
    """

    def decorated_api(api):

        def wrapped(*args, **kwargs):
            api_obj = api(*args, **kwargs)
            # _decorate_methods mutates and returns the same object.
            return _decorate_methods(api_obj,
                                     set_logging_level(logger, level))

        return wrapped

    return decorated_api


def _load_config():
global _configured
if _configured:
Expand Down Expand Up @@ -65,15 +94,16 @@ def _load_config():
_configured = True


@_api_logging_decorator('urllib3', logging.ERROR)
def core_api():
    """Return the shared CoreV1Api client, creating it on first use."""
    global _core_api
    if _core_api is not None:
        return _core_api
    _load_config()
    _core_api = kubernetes.client.CoreV1Api()
    return _core_api


@_api_logging_decorator('urllib3', logging.ERROR)
def auth_api():
global _auth_api
if _auth_api is None:
Expand All @@ -83,6 +113,7 @@ def auth_api():
return _auth_api


@_api_logging_decorator('urllib3', logging.ERROR)
def networking_api():
global _networking_api
if _networking_api is None:
Expand All @@ -92,6 +123,7 @@ def networking_api():
return _networking_api


@_api_logging_decorator('urllib3', logging.ERROR)
def custom_objects_api():
global _custom_objects_api
if _custom_objects_api is None:
Expand All @@ -101,6 +133,7 @@ def custom_objects_api():
return _custom_objects_api


@_api_logging_decorator('urllib3', logging.ERROR)
def node_api():
global _node_api
if _node_api is None:
Expand All @@ -110,6 +143,7 @@ def node_api():
return _node_api


@_api_logging_decorator('urllib3', logging.ERROR)
def apps_api():
global _apps_api
if _apps_api is None:
Expand All @@ -119,6 +153,7 @@ def apps_api():
return _apps_api


@_api_logging_decorator('urllib3', logging.ERROR)
def api_client():
global _api_client
if _api_client is None:
Expand Down
41 changes: 25 additions & 16 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,29 +439,38 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
f'Key {secret_name} does not exist in the cluster, creating it...')
kubernetes.core_api().create_namespaced_secret(namespace, secret)

ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
private_key_path, _ = get_or_generate_keys()
if network_mode == nodeport_mode:
ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
service_type = kubernetes_enums.KubernetesServiceType.NODEPORT
# Setup service for SSH jump pod. We create the SSH jump service here
# because we need to know the service IP address and port to set the
# ssh_proxy_command in the autoscaler config.
kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace,
service_type)
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
ssh_jump_name,
nodeport_mode,
private_key_path=private_key_path,
namespace=namespace)
elif network_mode == port_forward_mode:
# Using `kubectl port-forward` creates a direct tunnel to the pod and
# does not require a ssh jump pod.
kubernetes_utils.check_port_forward_mode_dependencies()
# Using `kubectl port-forward` creates a direct tunnel to jump pod and
# does not require opening any ports on Kubernetes nodes. As a result,
# the service can be a simple ClusterIP service which we access with
# `kubectl port-forward`.
service_type = kubernetes_enums.KubernetesServiceType.CLUSTERIP
# TODO(romilb): This can be further optimized. Instead of using the
# head node as a jump pod for worker nodes, we can also directly
# set the ssh_target to the worker node. However, that requires
# changes in the downstream code to return a mapping of node IPs to
# pod names (to be used as ssh_target) and updating the upstream
# SSHConfigHelper to use a different ProxyCommand for each pod.
# This optimization can reduce SSH time from ~0.35s to ~0.25s, tested
# on GKE.
ssh_target = config['cluster_name'] + '-head'
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
ssh_target, port_forward_mode, private_key_path=private_key_path)
else:
# This should never happen because we check for this in from_str above.
raise ValueError(f'Unsupported networking mode: {network_mode_str}')
# Setup service for SSH jump pod. We create the SSH jump service here
# because we need to know the service IP address and port to set the
# ssh_proxy_command in the autoscaler config.
kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, service_type)

ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
PRIVATE_SSH_KEY_PATH, ssh_jump_name, network_mode, namespace,
clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_PATH,
clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_TEMPLATE)

config['auth']['ssh_proxy_command'] = ssh_proxy_cmd

return config
Expand Down
6 changes: 6 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,6 +1251,12 @@ def ssh_credential_from_yaml(
ssh_private_key = auth_section.get('ssh_private_key')
ssh_control_name = config.get('cluster_name', '__default__')
ssh_proxy_command = auth_section.get('ssh_proxy_command')

# Update the ssh_user placeholder in proxy command, if required
if (ssh_proxy_command is not None and
constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
ssh_proxy_command = ssh_proxy_command.replace(
constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
credentials = {
'ssh_user': ssh_user,
'ssh_private_key': ssh_private_key,
Expand Down
31 changes: 7 additions & 24 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3065,7 +3065,10 @@ def _update_after_cluster_provisioned(
)
usage_lib.messages.usage.update_final_cluster_status(
status_lib.ClusterStatus.UP)
auth_config = common_utils.read_yaml(handle.cluster_yaml)['auth']
auth_config = backend_utils.ssh_credential_from_yaml(
handle.cluster_yaml,
ssh_user=handle.ssh_user,
docker_user=handle.docker_user)
backend_utils.SSHConfigHelper.add_cluster(handle.cluster_name,
ip_list, auth_config,
ssh_port_list,
Expand Down Expand Up @@ -3885,22 +3888,8 @@ def teardown_no_lock(self,
self.post_teardown_cleanup(handle, terminate, purge)
return

if terminate and isinstance(cloud, clouds.Azure):
# Here we handle termination of Azure by ourselves instead of Ray
# autoscaler.
resource_group = config['provider']['resource_group']
terminate_cmd = f'az group delete -y --name {resource_group}'
with rich_utils.safe_status(f'[bold cyan]Terminating '
f'[green]{cluster_name}'):
returncode, stdout, stderr = log_lib.run_with_log(
terminate_cmd,
log_abs_path,
shell=True,
stream_logs=False,
require_outputs=True)

elif (isinstance(cloud, clouds.IBM) and terminate and
prev_cluster_status == status_lib.ClusterStatus.STOPPED):
if (isinstance(cloud, clouds.IBM) and terminate and
prev_cluster_status == status_lib.ClusterStatus.STOPPED):
# pylint: disable= W0622 W0703 C0415
from sky.adaptors import ibm
from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
Expand Down Expand Up @@ -4018,14 +4007,8 @@ def teardown_no_lock(self,
# never launched and the errors are related to pre-launch
# configurations (such as VPC not found). So it's safe & good UX
# to not print a failure message.
#
# '(ResourceGroupNotFound)': this indicates the resource group on
# Azure is not found. That means the cluster is already deleted
# on the cloud. So it's safe & good UX to not print a failure
# message.
elif ('TPU must be specified.' not in stderr and
'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr and
'(ResourceGroupNotFound)' not in stderr):
'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr):
raise RuntimeError(
_TEARDOWN_FAILURE_MESSAGE.format(
extra_reason='',
Expand Down
Loading

0 comments on commit 64126d4

Please sign in to comment.