Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/skypilot-org/skypilot into azure-query-status
Browse files Browse the repository at this point in the history
  • Loading branch information
Michaelvll committed Jun 29, 2024
2 parents bd8471a + 4821f70 commit 6c92072
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 18 deletions.
2 changes: 2 additions & 0 deletions llm/vllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ sky launch -c vllm-llama2 serve-openai-api.yaml --env HF_TOKEN=YOUR_HUGGING_FACE
```bash
sky launch -c vllm-llama2 serve-openai-api.yaml --gpus V100:1 --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN
```
**Tip**: You can also use the vLLM Docker container for faster setup. Refer to [serve-openai-api-docker.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/serve-openai-api-docker.yaml) for more details.

2. Check the IP for the cluster with:
```
IP=$(sky status --ip vllm-llama2)
Expand Down
20 changes: 20 additions & 0 deletions llm/vllm/serve-openai-api-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Task definition that serves a model through vLLM's OpenAI-compatible API
# server, using the vLLM docker image as the runtime.

# Environment variables made available to the `setup` and `run` commands.
envs:
  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
  image_id: docker:vllm/vllm-openai:latest
  # Candidate accelerators for the task.
  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
  # Port opened on the cluster for the API server.
  ports:
    - 8000

# Log in to Hugging Face with the token from `envs`.
setup: |
  conda deactivate
  python3 -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
# Start the OpenAI-compatible API server, listening on all interfaces.
run: |
  conda deactivate
  echo 'Starting vllm openai api server...'
  python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \
    --host 0.0.0.0
23 changes: 21 additions & 2 deletions sky/adaptors/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
# pylint: disable=import-outside-toplevel
import functools
import threading
import time

from sky.adaptors import common
from sky.utils import common_utils

azure = common.LazyImport(
'azure',
Expand All @@ -13,13 +15,30 @@
# Lazily imported modules; passed to `common.load_lazy_modules` by the
# decorated functions in this file.
_LAZY_MODULES = (azure,)

# Re-entrant lock; presumably guards Azure session/client creation -- its use
# is not visible in this part of the file, confirm against the callers.
_session_creation_lock = threading.RLock()
# Upper bound on how many times `get_subscription_id` retries after the
# "Please run 'az login'" error before re-raising it.
_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5


@common.load_lazy_modules(modules=_LAZY_MODULES)
@functools.lru_cache()
def get_subscription_id() -> str:
    """Get the default subscription id.

    The id is read from the Azure CLI profile. The lookup is retried with
    backoff, up to ``_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID`` times, when it
    fails with the "Please run 'az login' to setup account." error. A
    successful result is memoized by ``functools.lru_cache``.

    Returns:
        The subscription id reported by the Azure CLI profile.

    Raises:
        Exception: Any error other than the "az login" error, or that error
            once the retry budget is exhausted. The original exception is
            re-raised unchanged.
    """
    from azure.common import credentials
    retry = 0
    backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4)
    while True:
        try:
            return credentials.get_cli_profile().get_subscription_id()
        except Exception as e:  # pylint: disable=broad-except
            retryable = ('Please run \'az login\' to setup account.' in str(e)
                         and retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID)
            if not retryable:
                raise
            # When there are multiple processes trying to get the
            # subscription id, it may fail with the above error message.
            # Retry will fix the issue.
            retry += 1
            time.sleep(backoff.current_backoff())


@common.load_lazy_modules(modules=_LAZY_MODULES)
Expand All @@ -36,8 +55,8 @@ def exceptions():
return azure_exceptions


@functools.lru_cache()
@common.load_lazy_modules(modules=_LAZY_MODULES)
@functools.lru_cache()
def get_client(name: str, subscription_id: str):
# Sky only supports Azure CLI credential for now.
# Increase the timeout to fix the Azure get-access-token timeout issue.
Expand Down
40 changes: 30 additions & 10 deletions sky/skylet/providers/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.resource.resources.models import DeploymentMode

from sky.adaptors import azure
from sky.utils import common_utils
from sky.provision import common

Expand Down Expand Up @@ -122,17 +123,36 @@ def _configure_resource_group(config):
create_or_update = get_azure_sdk_function(
client=resource_client.deployments, function_name="create_or_update"
)
# TODO (skypilot): this takes a long time (> 40 seconds) for stopping an
# azure VM, and this can be called twice during ray down.
outputs = (
create_or_update(
resource_group_name=resource_group,
deployment_name="ray-config",
parameters=parameters,
)
.result()
.properties.outputs
# Skip creating or updating the deployment if the deployment already exists
# and the cluster name is the same.
get_deployment = get_azure_sdk_function(
client=resource_client.deployments, function_name="get"
)
deployment_exists = False
try:
deployment = get_deployment(
resource_group_name=resource_group, deployment_name="ray-config"
)
logger.info("Deployment already exists. Skipping deployment creation.")

outputs = deployment.properties.outputs
if outputs is not None:
deployment_exists = True
except azure.exceptions().ResourceNotFoundError:
deployment_exists = False

if not deployment_exists:
# This takes a long time (> 40 seconds), we should be careful calling
# this function.
outputs = (
create_or_update(
resource_group_name=resource_group,
deployment_name="ray-config",
parameters=parameters,
)
.result()
.properties.outputs
)

# We should wait for the NSG to be created before opening any ports
# to avoid overriding the newly-added NSG rules.
Expand Down
15 changes: 10 additions & 5 deletions sky/skylet/providers/azure/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,17 @@ def match_tags(vm):
return True

try:
vms = self.compute_client.virtual_machines.list(
resource_group_name=self.provider_config["resource_group"]
vms = list(
self.compute_client.virtual_machines.list(
resource_group_name=self.provider_config["resource_group"]
)
)
except azure.exceptions().HttpResponseError as e:
if e.reason == "ResourceGroupNotFound":
vms = {}
except azure.exceptions().ResourceNotFoundError as e:
if "Code: ResourceGroupNotFound" in e.exc_msg:
logger.debug(
"Resource group not found. VMs should have been terminated."
)
vms = []
else:
raise

Expand Down
2 changes: 1 addition & 1 deletion sky/utils/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ class Backoff:
MULTIPLIER = 1.6
JITTER = 0.4

def __init__(self, initial_backoff: int = 5, max_backoff_factor: int = 5):
def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
self._initial = True
self._backoff = 0.0
self._initial_backoff = initial_backoff
Expand Down

0 comments on commit 6c92072

Please sign in to comment.