diff --git a/llm/vllm/README.md b/llm/vllm/README.md index 61932cd8571..e3a2befbecc 100644 --- a/llm/vllm/README.md +++ b/llm/vllm/README.md @@ -33,6 +33,8 @@ sky launch -c vllm-llama2 serve-openai-api.yaml --env HF_TOKEN=YOUR_HUGGING_FACE ```bash sky launch -c vllm-llama2 serve-openai-api.yaml --gpus V100:1 --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN ``` +**Tip**: You can also use the vLLM docker container for faster setup. Refer to [serve-openai-api-docker.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/serve-openai-api-docker.yaml) for more. + 2. Check the IP for the cluster with: ``` IP=$(sky status --ip vllm-llama2) diff --git a/llm/vllm/serve-openai-api-docker.yaml b/llm/vllm/serve-openai-api-docker.yaml new file mode 100644 index 00000000000..0a980092e99 --- /dev/null +++ b/llm/vllm/serve-openai-api-docker.yaml @@ -0,0 +1,20 @@ +envs: + MODEL_NAME: meta-llama/Llama-2-7b-chat-hf + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + +resources: + image_id: docker:vllm/vllm-openai:latest + accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1} + ports: + - 8000 + +setup: | + conda deactivate + python3 -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + +run: | + conda deactivate + echo 'Starting vllm openai api server...' + python -m vllm.entrypoints.openai.api_server \ + --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \ + --host 0.0.0.0 diff --git a/sky/adaptors/azure.py b/sky/adaptors/azure.py index 44618a8f64f..6bd57bc6bec 100644 --- a/sky/adaptors/azure.py +++ b/sky/adaptors/azure.py @@ -3,8 +3,10 @@ # pylint: disable=import-outside-toplevel import functools import threading +import time from sky.adaptors import common +from sky.utils import common_utils azure = common.LazyImport( 'azure', @@ -13,13 +15,30 @@ _LAZY_MODULES = (azure,) _session_creation_lock = threading.RLock() +_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5 @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_subscription_id() -> str: """Get the default subscription id.""" from azure.common import credentials - return credentials.get_cli_profile().get_subscription_id() + retry = 0 + backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4) + while True: + try: + return credentials.get_cli_profile().get_subscription_id() + except Exception as e: + if ('Please run \'az login\' to setup account.' in str(e) and + retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID): + # When there are multiple processes trying to get the + # subscription id, it may fail with the above error message. + # Retry will fix the issue. + retry += 1 + + time.sleep(backoff.current_backoff()) + continue + raise @common.load_lazy_modules(modules=_LAZY_MODULES) @@ -36,8 +55,8 @@ def exceptions(): return azure_exceptions -@functools.lru_cache() @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_client(name: str, subscription_id: str): # Sky only supports Azure CLI credential for now. # Increase the timeout to fix the Azure get-access-token timeout issue. diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py index dbd961c83b2..13ecd64a987 100644 --- a/sky/skylet/providers/azure/config.py +++ b/sky/skylet/providers/azure/config.py @@ -12,6 +12,7 @@ from azure.mgmt.resource import ResourceManagementClient from azure.mgmt.resource.resources.models import DeploymentMode +from sky.adaptors import azure from sky.utils import common_utils from sky.provision import common @@ -122,17 +123,36 @@ def _configure_resource_group(config): create_or_update = get_azure_sdk_function( client=resource_client.deployments, function_name="create_or_update" ) - # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an - # azure VM, and this can be called twice during ray down. - outputs = ( - create_or_update( - resource_group_name=resource_group, - deployment_name="ray-config", - parameters=parameters, - ) - .result() - .properties.outputs + # Skip creating or updating the deployment if the deployment already exists + # and the cluster name is the same. + get_deployment = get_azure_sdk_function( + client=resource_client.deployments, function_name="get" ) + deployment_exists = False + try: + deployment = get_deployment( + resource_group_name=resource_group, deployment_name="ray-config" + ) + logger.info("Deployment already exists. Skipping deployment creation.") + + outputs = deployment.properties.outputs + if outputs is not None: + deployment_exists = True + except azure.exceptions().ResourceNotFoundError: + deployment_exists = False + + if not deployment_exists: + # This takes a long time (> 40 seconds), we should be careful calling + # this function. + outputs = ( + create_or_update( + resource_group_name=resource_group, + deployment_name="ray-config", + parameters=parameters, + ) + .result() + .properties.outputs + ) # We should wait for the NSG to be created before opening any ports # to avoid overriding the newly-added NSG rules. diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 4783dffa2c0..b4a1c656688 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -91,12 +91,17 @@ def match_tags(vm): return True try: - vms = self.compute_client.virtual_machines.list( - resource_group_name=self.provider_config["resource_group"] + vms = list( + self.compute_client.virtual_machines.list( + resource_group_name=self.provider_config["resource_group"] + ) ) - except azure.exceptions().HttpResponseError as e: - if e.reason == "ResourceGroupNotFound": - vms = {} + except azure.exceptions().ResourceNotFoundError as e: + if "Code: ResourceGroupNotFound" in e.exc_msg: + logger.debug( + "Resource group not found. VMs should have been terminated." + ) + vms = [] else: raise diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index 103c834000c..a9227fb4c20 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -233,7 +233,7 @@ class Backoff: MULTIPLIER = 1.6 JITTER = 0.4 - def __init__(self, initial_backoff: int = 5, max_backoff_factor: int = 5): + def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5): self._initial = True self._backoff = 0.0 self._initial_backoff = initial_backoff