Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into fix-az-a10
Browse files Browse the repository at this point in the history
  • Loading branch information
cblmemo committed Jul 2, 2024
2 parents 823ed8d + b03c617 commit 64126d4
Show file tree
Hide file tree
Showing 31 changed files with 728 additions and 384 deletions.
11 changes: 10 additions & 1 deletion examples/managed_job_with_storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ workdir: ./examples

file_mounts:
~/bucket_workdir:
# Change this to the your own globally unique bucket name.
# Change this to your own globally unique bucket name.
name: sky-workdir-zhwu
source: ./examples
persistent: false
mode: COPY

/output_path:
# Change this to your own globally unique bucket name.
name: sky-output-bucket
mode: MOUNT

/imagenet-image:
source: s3://sky-imagenet-data

Expand Down Expand Up @@ -55,3 +61,6 @@ run: |
cat ~/tmpfile
cat ~/a/b/c/tmpfile
# Write to a file in the mounted bucket
echo "hello world!" > /output_path/output.txt
2 changes: 2 additions & 0 deletions llm/vllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ sky launch -c vllm-llama2 serve-openai-api.yaml --env HF_TOKEN=YOUR_HUGGING_FACE
```bash
sky launch -c vllm-llama2 serve-openai-api.yaml --gpus V100:1 --env HF_TOKEN=YOUR_HUGGING_FACE_API_TOKEN
```
**Tip**: You can also use the vLLM Docker container for a faster setup. Refer to [serve-openai-api-docker.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/serve-openai-api-docker.yaml) for details.

2. Check the IP for the cluster with:
```
IP=$(sky status --ip vllm-llama2)
Expand Down
20 changes: 20 additions & 0 deletions llm/vllm/serve-openai-api-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
envs:
  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
  image_id: docker:vllm/vllm-openai:latest
  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
  ports:
    - 8000

setup: |
  conda deactivate
  python3 -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"

run: |
  conda deactivate
  echo 'Starting vllm openai api server...'
  python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \
    --host 0.0.0.0
23 changes: 21 additions & 2 deletions sky/adaptors/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
# pylint: disable=import-outside-toplevel
import functools
import threading
import time

from sky.adaptors import common
from sky.utils import common_utils

azure = common.LazyImport(
'azure',
Expand All @@ -13,13 +15,30 @@
_LAZY_MODULES = (azure,)

_session_creation_lock = threading.RLock()
_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5


@common.load_lazy_modules(modules=_LAZY_MODULES)
@functools.lru_cache()
def get_subscription_id() -> str:
    """Get the default subscription id from the Azure CLI profile.

    The stripped diff left an unconditional early ``return`` before the
    retry loop, making the retry logic unreachable; only the retrying
    version is kept here.

    Returns:
        The subscription id of the default Azure CLI profile.

    Raises:
        Exception: Any error from the Azure CLI other than the transient
            'az login' race handled below.
    """
    from azure.common import credentials
    retry = 0
    backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4)
    while True:
        try:
            return credentials.get_cli_profile().get_subscription_id()
        except Exception as e:  # pylint: disable=broad-except
            if ('Please run \'az login\' to setup account.' in str(e) and
                    retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID):
                # When there are multiple processes trying to get the
                # subscription id, it may fail with the above error message.
                # Retry will fix the issue.
                retry += 1

                time.sleep(backoff.current_backoff())
                continue
            raise


@common.load_lazy_modules(modules=_LAZY_MODULES)
Expand All @@ -36,8 +55,8 @@ def exceptions():
return azure_exceptions


@functools.lru_cache()
@common.load_lazy_modules(modules=_LAZY_MODULES)
@functools.lru_cache()
def get_client(name: str, subscription_id: str):
# Sky only supports Azure CLI credential for now.
# Increase the timeout to fix the Azure get-access-token timeout issue.
Expand Down
37 changes: 36 additions & 1 deletion sky/adaptors/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

# pylint: disable=import-outside-toplevel

import logging
import os

from sky.adaptors import common
from sky.sky_logging import set_logging_level
from sky.utils import env_options
from sky.utils import ux_utils

Expand All @@ -28,6 +30,33 @@
API_TIMEOUT = 5


def _decorate_methods(obj, decorator):
for attr_name in dir(obj):
attr = getattr(obj, attr_name)
if callable(attr) and not attr_name.startswith('__'):
setattr(obj, attr_name, decorator(attr))
return obj


def _api_logging_decorator(logger: str, level: int):
    """Build a decorator that adjusts logging for API-client constructors.

    Every public method of the object returned by the decorated constructor
    is wrapped so that ``logger`` is set to ``level`` around each call. This
    suppresses the verbose urllib3 output emitted when Kubernetes API calls
    time out.
    """

    def decorated_api(api):

        def wrapped(*args, **kwargs):
            api_obj = api(*args, **kwargs)
            # _decorate_methods mutates and returns the same object.
            return _decorate_methods(api_obj,
                                     set_logging_level(logger, level))

        return wrapped

    return decorated_api


def _load_config():
global _configured
if _configured:
Expand Down Expand Up @@ -65,15 +94,16 @@ def _load_config():
_configured = True


@_api_logging_decorator('urllib3', logging.ERROR)
def core_api():
    """Return the shared CoreV1Api client, creating it on first use."""
    global _core_api
    if _core_api is not None:
        return _core_api
    _load_config()
    _core_api = kubernetes.client.CoreV1Api()
    return _core_api


@_api_logging_decorator('urllib3', logging.ERROR)
def auth_api():
global _auth_api
if _auth_api is None:
Expand All @@ -83,6 +113,7 @@ def auth_api():
return _auth_api


@_api_logging_decorator('urllib3', logging.ERROR)
def networking_api():
global _networking_api
if _networking_api is None:
Expand All @@ -92,6 +123,7 @@ def networking_api():
return _networking_api


@_api_logging_decorator('urllib3', logging.ERROR)
def custom_objects_api():
global _custom_objects_api
if _custom_objects_api is None:
Expand All @@ -101,6 +133,7 @@ def custom_objects_api():
return _custom_objects_api


@_api_logging_decorator('urllib3', logging.ERROR)
def node_api():
global _node_api
if _node_api is None:
Expand All @@ -110,6 +143,7 @@ def node_api():
return _node_api


@_api_logging_decorator('urllib3', logging.ERROR)
def apps_api():
global _apps_api
if _apps_api is None:
Expand All @@ -119,6 +153,7 @@ def apps_api():
return _apps_api


@_api_logging_decorator('urllib3', logging.ERROR)
def api_client():
global _api_client
if _api_client is None:
Expand Down
41 changes: 25 additions & 16 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,29 +439,38 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
f'Key {secret_name} does not exist in the cluster, creating it...')
kubernetes.core_api().create_namespaced_secret(namespace, secret)

ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
private_key_path, _ = get_or_generate_keys()
if network_mode == nodeport_mode:
ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
service_type = kubernetes_enums.KubernetesServiceType.NODEPORT
# Setup service for SSH jump pod. We create the SSH jump service here
# because we need to know the service IP address and port to set the
# ssh_proxy_command in the autoscaler config.
kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace,
service_type)
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
ssh_jump_name,
nodeport_mode,
private_key_path=private_key_path,
namespace=namespace)
elif network_mode == port_forward_mode:
# Using `kubectl port-forward` creates a direct tunnel to the pod and
# does not require a ssh jump pod.
kubernetes_utils.check_port_forward_mode_dependencies()
# Using `kubectl port-forward` creates a direct tunnel to jump pod and
# does not require opening any ports on Kubernetes nodes. As a result,
# the service can be a simple ClusterIP service which we access with
# `kubectl port-forward`.
service_type = kubernetes_enums.KubernetesServiceType.CLUSTERIP
# TODO(romilb): This can be further optimized. Instead of using the
# head node as a jump pod for worker nodes, we can also directly
# set the ssh_target to the worker node. However, that requires
# changes in the downstream code to return a mapping of node IPs to
# pod names (to be used as ssh_target) and updating the upstream
# SSHConfigHelper to use a different ProxyCommand for each pod.
# This optimization can reduce SSH time from ~0.35s to ~0.25s, tested
# on GKE.
ssh_target = config['cluster_name'] + '-head'
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
ssh_target, port_forward_mode, private_key_path=private_key_path)
else:
# This should never happen because we check for this in from_str above.
raise ValueError(f'Unsupported networking mode: {network_mode_str}')
# Setup service for SSH jump pod. We create the SSH jump service here
# because we need to know the service IP address and port to set the
# ssh_proxy_command in the autoscaler config.
kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, service_type)

ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
PRIVATE_SSH_KEY_PATH, ssh_jump_name, network_mode, namespace,
clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_PATH,
clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_TEMPLATE)

config['auth']['ssh_proxy_command'] = ssh_proxy_cmd

return config
Expand Down
6 changes: 6 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,6 +1251,12 @@ def ssh_credential_from_yaml(
ssh_private_key = auth_section.get('ssh_private_key')
ssh_control_name = config.get('cluster_name', '__default__')
ssh_proxy_command = auth_section.get('ssh_proxy_command')

# Update the ssh_user placeholder in proxy command, if required
if (ssh_proxy_command is not None and
constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
ssh_proxy_command = ssh_proxy_command.replace(
constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
credentials = {
'ssh_user': ssh_user,
'ssh_private_key': ssh_private_key,
Expand Down
31 changes: 7 additions & 24 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3065,7 +3065,10 @@ def _update_after_cluster_provisioned(
)
usage_lib.messages.usage.update_final_cluster_status(
status_lib.ClusterStatus.UP)
auth_config = common_utils.read_yaml(handle.cluster_yaml)['auth']
auth_config = backend_utils.ssh_credential_from_yaml(
handle.cluster_yaml,
ssh_user=handle.ssh_user,
docker_user=handle.docker_user)
backend_utils.SSHConfigHelper.add_cluster(handle.cluster_name,
ip_list, auth_config,
ssh_port_list,
Expand Down Expand Up @@ -3885,22 +3888,8 @@ def teardown_no_lock(self,
self.post_teardown_cleanup(handle, terminate, purge)
return

if terminate and isinstance(cloud, clouds.Azure):
# Here we handle termination of Azure by ourselves instead of Ray
# autoscaler.
resource_group = config['provider']['resource_group']
terminate_cmd = f'az group delete -y --name {resource_group}'
with rich_utils.safe_status(f'[bold cyan]Terminating '
f'[green]{cluster_name}'):
returncode, stdout, stderr = log_lib.run_with_log(
terminate_cmd,
log_abs_path,
shell=True,
stream_logs=False,
require_outputs=True)

elif (isinstance(cloud, clouds.IBM) and terminate and
prev_cluster_status == status_lib.ClusterStatus.STOPPED):
if (isinstance(cloud, clouds.IBM) and terminate and
prev_cluster_status == status_lib.ClusterStatus.STOPPED):
# pylint: disable= W0622 W0703 C0415
from sky.adaptors import ibm
from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
Expand Down Expand Up @@ -4018,14 +4007,8 @@ def teardown_no_lock(self,
# never launched and the errors are related to pre-launch
# configurations (such as VPC not found). So it's safe & good UX
# to not print a failure message.
#
# '(ResourceGroupNotFound)': this indicates the resource group on
# Azure is not found. That means the cluster is already deleted
# on the cloud. So it's safe & good UX to not print a failure
# message.
elif ('TPU must be specified.' not in stderr and
'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr and
'(ResourceGroupNotFound)' not in stderr):
'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr):
raise RuntimeError(
_TEARDOWN_FAILURE_MESSAGE.format(
extra_reason='',
Expand Down
Loading

0 comments on commit 64126d4

Please sign in to comment.