Merge branch 'master' of https://github.com/skypilot-org/skypilot into azure-query-status
skypilot-org · Jun 29, 2024 · 6c92072 · 6c92072
2 parents bd8471a + 4821f70
commit 6c92072
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 18 deletions.
diff --git a/llm/vllm/README.md b/llm/vllm/README.md
@@ -33,6 +33,8 @@ sky launch -c vllm-llama2 serve-openai-api.yaml --env HF_TOKEN=YOUR_HUGGING_FACE
 ```bash
 sky launch -c vllm-llama2 serve-openai-api.yaml --gpus V100:1 --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN
 ```
+**Tip**: You can also use the vLLM Docker container for faster setup. Refer to [serve-openai-api-docker.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/serve-openai-api-docker.yaml) for more details.
+
 2. Check the IP for the cluster with:
 ```
 IP=$(sky status --ip vllm-llama2)

diff --git a/llm/vllm/serve-openai-api-docker.yaml b/llm/vllm/serve-openai-api-docker.yaml
@@ -0,0 +1,20 @@
+envs:
+  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
+  HF_TOKEN: # TODO: Fill in your own Hugging Face token, or pass it with --env.
+
+resources:
+  image_id: docker:vllm/vllm-openai:latest
+  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
+  ports:
+    - 8000
+
+setup: |
+  conda deactivate
+  python3 -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+run: |
+  conda deactivate
+  echo 'Starting vllm openai api server...'
+  python -m vllm.entrypoints.openai.api_server \
+    --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \
+    --host 0.0.0.0
diff --git a/sky/adaptors/azure.py b/sky/adaptors/azure.py
@@ -3,8 +3,10 @@
 # pylint: disable=import-outside-toplevel
 import functools
 import threading
+import time
 
 from sky.adaptors import common
+from sky.utils import common_utils
 
 azure = common.LazyImport(
     'azure',
@@ -13,13 +15,30 @@
 _LAZY_MODULES = (azure,)
 
 _session_creation_lock = threading.RLock()
+_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5
 
 
 @common.load_lazy_modules(modules=_LAZY_MODULES)
+@functools.lru_cache()
 def get_subscription_id() -> str:
     """Get the default subscription id."""
     from azure.common import credentials
-    return credentials.get_cli_profile().get_subscription_id()
+    retry = 0
+    backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4)
+    while True:
+        try:
+            return credentials.get_cli_profile().get_subscription_id()
+        except Exception as e:
+            if ('Please run \'az login\' to setup account.' in str(e) and
+                    retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID):
+                # When multiple processes try to get the subscription id,
+                # the call may fail with the above error message.
+                # Retrying fixes the issue.
+                retry += 1
+
+                time.sleep(backoff.current_backoff())
+                continue
+            raise
 
 
 @common.load_lazy_modules(modules=_LAZY_MODULES)
@@ -36,8 +55,8 @@ def exceptions():
     return azure_exceptions
 
 
-@functools.lru_cache()
 @common.load_lazy_modules(modules=_LAZY_MODULES)
+@functools.lru_cache()
 def get_client(name: str, subscription_id: str):
     # Sky only supports Azure CLI credential for now.
     # Increase the timeout to fix the Azure get-access-token timeout issue.

diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py
@@ -12,6 +12,7 @@
 from azure.mgmt.resource import ResourceManagementClient
 from azure.mgmt.resource.resources.models import DeploymentMode
 
+from sky.adaptors import azure
 from sky.utils import common_utils
 from sky.provision import common
 
@@ -122,17 +123,36 @@ def _configure_resource_group(config):
     create_or_update = get_azure_sdk_function(
         client=resource_client.deployments, function_name="create_or_update"
     )
-    # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an
-    # azure VM, and this can be called twice during ray down.
-    outputs = (
-        create_or_update(
-            resource_group_name=resource_group,
-            deployment_name="ray-config",
-            parameters=parameters,
-        )
-        .result()
-        .properties.outputs
+    # Skip creating or updating the deployment if the deployment already exists
+    # and the cluster name is the same.
+    get_deployment = get_azure_sdk_function(
+        client=resource_client.deployments, function_name="get"
     )
+    deployment_exists = False
+    try:
+        deployment = get_deployment(
+            resource_group_name=resource_group, deployment_name="ray-config"
+        )
+        logger.info("Deployment already exists. Skipping deployment creation.")
+
+        outputs = deployment.properties.outputs
+        if outputs is not None:
+            deployment_exists = True
+    except azure.exceptions().ResourceNotFoundError:
+        deployment_exists = False
+
+    if not deployment_exists:
+        # This takes a long time (> 40 seconds), so we should be careful when
+        # calling this function.
+        outputs = (
+            create_or_update(
+                resource_group_name=resource_group,
+                deployment_name="ray-config",
+                parameters=parameters,
+            )
+            .result()
+            .properties.outputs
+        )
 
     # We should wait for the NSG to be created before opening any ports
     # to avoid overriding the newly-added NSG rules.

diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py
@@ -91,12 +91,17 @@ def match_tags(vm):
             return True
 
         try:
-            vms = self.compute_client.virtual_machines.list(
-                resource_group_name=self.provider_config["resource_group"]
+            vms = list(
+                self.compute_client.virtual_machines.list(
+                    resource_group_name=self.provider_config["resource_group"]
+                )
             )
-        except azure.exceptions().HttpResponseError as e:
-            if e.reason == "ResourceGroupNotFound":
-                vms = {}
+        except azure.exceptions().ResourceNotFoundError as e:
+            if "Code: ResourceGroupNotFound" in e.exc_msg:
+                logger.debug(
+                    "Resource group not found. VMs should have been terminated."
+                )
+                vms = []
             else:
                 raise
 

diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py
@@ -233,7 +233,7 @@ class Backoff:
     MULTIPLIER = 1.6
     JITTER = 0.4
 
-    def __init__(self, initial_backoff: int = 5, max_backoff_factor: int = 5):
+    def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
         self._initial = True
         self._backoff = 0.0
         self._initial_backoff = initial_backoff